; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck -check-prefix=X32 %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck -check-prefix=X64 %s

;; A basic sanity check to make sure that MMX arithmetic actually compiles.
;; First is a straight translation of the original with bitcasts as needed.

define void @test0(x86_mmx* %A, x86_mmx* %B) {
; X32-LABEL: test0:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %ebp
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    .cfi_offset %ebp, -8
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    .cfi_def_cfa_register %ebp
; X32-NEXT:    andl $-8, %esp
; X32-NEXT:    subl $32, %esp
; X32-NEXT:    movl 12(%ebp), %ecx
; X32-NEXT:    movl 8(%ebp), %eax
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X32-NEXT:    paddw %xmm0, %xmm1
; X32-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; X32-NEXT:    pand %xmm0, %xmm1
; X32-NEXT:    packuswb %xmm1, %xmm1
; X32-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    paddsb (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    paddusb (%ecx), %mm0
; X32-NEXT:    movq %mm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; X32-NEXT:    psubw %xmm2, %xmm1
; X32-NEXT:    pand %xmm0, %xmm1
; X32-NEXT:    packuswb %xmm1, %xmm1
; X32-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    psubsb (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    psubusb (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%esp)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; X32-NEXT:    pmullw %xmm1, %xmm2
; X32-NEXT:    movdqa %xmm2, %xmm1
; X32-NEXT:    pand %xmm0, %xmm1
; X32-NEXT:    packuswb %xmm1, %xmm1
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X32-NEXT:    pand %xmm2, %xmm1
; X32-NEXT:    movdqa %xmm1, %xmm2
; X32-NEXT:    pand %xmm0, %xmm2
; X32-NEXT:    packuswb %xmm2, %xmm2
; X32-NEXT:    movq %xmm2, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; X32-NEXT:    por %xmm1, %xmm2
; X32-NEXT:    movdqa %xmm2, %xmm1
; X32-NEXT:    pand %xmm0, %xmm1
; X32-NEXT:    packuswb %xmm1, %xmm1
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X32-NEXT:    pxor %xmm2, %xmm1
; X32-NEXT:    pand %xmm0, %xmm1
; X32-NEXT:    packuswb %xmm1, %xmm1
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    emms
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    .cfi_def_cfa %esp, 4
; X32-NEXT:    retl
;
; X64-LABEL: test0:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT:    paddw %xmm0, %xmm1
; X64-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; X64-NEXT:    pand %xmm0, %xmm1
; X64-NEXT:    packuswb %xmm1, %xmm1
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    paddsb (%rsi), %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    paddusb (%rsi), %mm0
; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; X64-NEXT:    psubw %xmm2, %xmm1
; X64-NEXT:    pand %xmm0, %xmm1
; X64-NEXT:    packuswb %xmm1, %xmm1
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    psubsb (%rsi), %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    psubusb (%rsi), %mm0
; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; X64-NEXT:    pmullw %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm1
; X64-NEXT:    pand %xmm0, %xmm1
; X64-NEXT:    packuswb %xmm1, %xmm1
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT:    pand %xmm2, %xmm1
; X64-NEXT:    movdqa %xmm1, %xmm2
; X64-NEXT:    pand %xmm0, %xmm2
; X64-NEXT:    packuswb %xmm2, %xmm2
; X64-NEXT:    movq %xmm2, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; X64-NEXT:    por %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm1
; X64-NEXT:    pand %xmm0, %xmm1
; X64-NEXT:    packuswb %xmm1, %xmm1
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT:    pxor %xmm2, %xmm1
; X64-NEXT:    pand %xmm0, %xmm1
; X64-NEXT:    packuswb %xmm1, %xmm1
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %tmp1 = load x86_mmx, x86_mmx* %A
  %tmp3 = load x86_mmx, x86_mmx* %B
  %tmp1a = bitcast x86_mmx %tmp1 to <8 x i8>
  %tmp3a = bitcast x86_mmx %tmp3 to <8 x i8>
  %tmp4 = add <8 x i8> %tmp1a, %tmp3a
  %tmp4a = bitcast <8 x i8> %tmp4 to x86_mmx
  store x86_mmx %tmp4a, x86_mmx* %A
  %tmp7 = load x86_mmx, x86_mmx* %B
  %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %tmp4a, x86_mmx %tmp7)
  store x86_mmx %tmp12, x86_mmx* %A
  %tmp16 = load x86_mmx, x86_mmx* %B
  %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %tmp12, x86_mmx %tmp16)
  store x86_mmx %tmp21, x86_mmx* %A
  %tmp27 = load x86_mmx, x86_mmx* %B
  %tmp21a = bitcast x86_mmx %tmp21 to <8 x i8>
  %tmp27a = bitcast x86_mmx %tmp27 to <8 x i8>
  %tmp28 = sub <8 x i8> %tmp21a, %tmp27a
  %tmp28a = bitcast <8 x i8> %tmp28 to x86_mmx
  store x86_mmx %tmp28a, x86_mmx* %A
  %tmp31 = load x86_mmx, x86_mmx* %B
  %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %tmp28a, x86_mmx %tmp31)
  store x86_mmx %tmp36, x86_mmx* %A
  %tmp40 = load x86_mmx, x86_mmx* %B
  %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %tmp36, x86_mmx %tmp40)
  store x86_mmx %tmp45, x86_mmx* %A
  %tmp51 = load x86_mmx, x86_mmx* %B
  %tmp45a = bitcast x86_mmx %tmp45 to <8 x i8>
  %tmp51a = bitcast x86_mmx %tmp51 to <8 x i8>
  %tmp52 = mul <8 x i8> %tmp45a, %tmp51a
  %tmp52a = bitcast <8 x i8> %tmp52 to x86_mmx
  store x86_mmx %tmp52a, x86_mmx* %A
  %tmp57 = load x86_mmx, x86_mmx* %B
  %tmp57a = bitcast x86_mmx %tmp57 to <8 x i8>
  %tmp58 = and <8 x i8> %tmp52, %tmp57a
  %tmp58a = bitcast <8 x i8> %tmp58 to x86_mmx
  store x86_mmx %tmp58a, x86_mmx* %A
  %tmp63 = load x86_mmx, x86_mmx* %B
  %tmp63a = bitcast x86_mmx %tmp63 to <8 x i8>
  %tmp64 = or <8 x i8> %tmp58, %tmp63a
  %tmp64a = bitcast <8 x i8> %tmp64 to x86_mmx
  store x86_mmx %tmp64a, x86_mmx* %A
  %tmp69 = load x86_mmx, x86_mmx* %B
  %tmp69a = bitcast x86_mmx %tmp69 to <8 x i8>
  %tmp64b = bitcast x86_mmx %tmp64a to <8 x i8>
  %tmp70 = xor <8 x i8> %tmp64b, %tmp69a
  %tmp70a = bitcast <8 x i8> %tmp70 to x86_mmx
  store x86_mmx %tmp70a, x86_mmx* %A
  tail call void @llvm.x86.mmx.emms()
  ret void
}

define void @test1(x86_mmx* %A, x86_mmx* %B) {
; X32-LABEL: test1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X32-NEXT:    paddq %xmm0, %xmm1
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT:    movdqa %xmm1, %xmm2
; X32-NEXT:    psrlq $32, %xmm2
; X32-NEXT:    pmuludq %xmm0, %xmm2
; X32-NEXT:    movdqa %xmm0, %xmm3
; X32-NEXT:    psrlq $32, %xmm3
; X32-NEXT:    pmuludq %xmm1, %xmm3
; X32-NEXT:    paddq %xmm2, %xmm3
; X32-NEXT:    psllq $32, %xmm3
; X32-NEXT:    pmuludq %xmm1, %xmm0
; X32-NEXT:    paddq %xmm3, %xmm0
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X32-NEXT:    andps %xmm0, %xmm1
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT:    orps %xmm1, %xmm0
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X32-NEXT:    xorps %xmm0, %xmm1
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    emms
; X32-NEXT:    retl
;
; X64-LABEL: test1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X64-NEXT:    paddq %xmm0, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-NEXT:    movdqa %xmm1, %xmm2
; X64-NEXT:    psrlq $32, %xmm2
; X64-NEXT:    pmuludq %xmm0, %xmm2
; X64-NEXT:    movdqa %xmm0, %xmm3
; X64-NEXT:    psrlq $32, %xmm3
; X64-NEXT:    pmuludq %xmm1, %xmm3
; X64-NEXT:    paddq %xmm2, %xmm3
; X64-NEXT:    psllq $32, %xmm3
; X64-NEXT:    pmuludq %xmm0, %xmm1
; X64-NEXT:    paddq %xmm3, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-NEXT:    pand %xmm1, %xmm0
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X64-NEXT:    por %xmm0, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-NEXT:    pxor %xmm1, %xmm0
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %tmp1 = load x86_mmx, x86_mmx* %A
  %tmp3 = load x86_mmx, x86_mmx* %B
  %tmp1a = bitcast x86_mmx %tmp1 to <2 x i32>
  %tmp3a = bitcast x86_mmx %tmp3 to <2 x i32>
  %tmp4 = add <2 x i32> %tmp1a, %tmp3a
  %tmp4a = bitcast <2 x i32> %tmp4 to x86_mmx
  store x86_mmx %tmp4a, x86_mmx* %A
  %tmp9 = load x86_mmx, x86_mmx* %B
  %tmp9a = bitcast x86_mmx %tmp9 to <2 x i32>
  %tmp10 = sub <2 x i32> %tmp4, %tmp9a
  %tmp10a = bitcast <2 x i32> %tmp4 to x86_mmx
  store x86_mmx %tmp10a, x86_mmx* %A
  %tmp15 = load x86_mmx, x86_mmx* %B
  %tmp10b = bitcast x86_mmx %tmp10a to <2 x i32>
  %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32>
  %tmp16 = mul <2 x i32> %tmp10b, %tmp15a
  %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx
  store x86_mmx %tmp16a, x86_mmx* %A
  %tmp21 = load x86_mmx, x86_mmx* %B
  %tmp16b = bitcast x86_mmx %tmp16a to <2 x i32>
  %tmp21a = bitcast x86_mmx %tmp21 to <2 x i32>
  %tmp22 = and <2 x i32> %tmp16b, %tmp21a
  %tmp22a = bitcast <2 x i32> %tmp22 to x86_mmx
  store x86_mmx %tmp22a, x86_mmx* %A
  %tmp27 = load x86_mmx, x86_mmx* %B
  %tmp22b = bitcast x86_mmx %tmp22a to <2 x i32>
  %tmp27a = bitcast x86_mmx %tmp27 to <2 x i32>
  %tmp28 = or <2 x i32> %tmp22b, %tmp27a
  %tmp28a = bitcast <2 x i32> %tmp28 to x86_mmx
  store x86_mmx %tmp28a, x86_mmx* %A
  %tmp33 = load x86_mmx, x86_mmx* %B
  %tmp28b = bitcast x86_mmx %tmp28a to <2 x i32>
  %tmp33a = bitcast x86_mmx %tmp33 to <2 x i32>
  %tmp34 = xor <2 x i32> %tmp28b, %tmp33a
  %tmp34a = bitcast <2 x i32> %tmp34 to x86_mmx
  store x86_mmx %tmp34a, x86_mmx* %A
  tail call void @llvm.x86.mmx.emms( )
  ret void
}

define void @test2(x86_mmx* %A, x86_mmx* %B) {
; X32-LABEL: test2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %ebp
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    .cfi_offset %ebp, -8
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    .cfi_def_cfa_register %ebp
; X32-NEXT:    andl $-8, %esp
; X32-NEXT:    subl $48, %esp
; X32-NEXT:    movl 12(%ebp), %ecx
; X32-NEXT:    movl 8(%ebp), %eax
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X32-NEXT:    paddd %xmm0, %xmm1
; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    paddsw (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    paddusw (%ecx), %mm0
; X32-NEXT:    movq %mm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X32-NEXT:    psubd %xmm1, %xmm0
; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    psubsw (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    psubusw (%ecx), %mm0
; X32-NEXT:    movq %mm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-NEXT:    pmuludq %xmm1, %xmm0
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X32-NEXT:    pmuludq %xmm2, %xmm1
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    pmulhw (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    pmaddwd (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%esp)
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X32-NEXT:    pand %xmm0, %xmm1
; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-NEXT:    por %xmm1, %xmm0
; X32-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
; X32-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X32-NEXT:    pxor %xmm0, %xmm1
; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    emms
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    .cfi_def_cfa %esp, 4
; X32-NEXT:    retl
;
; X64-LABEL: test2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    paddd %xmm0, %xmm1
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    paddsw (%rsi), %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    paddusw (%rsi), %mm0
; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    psubd %xmm1, %xmm0
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    psubsw (%rsi), %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    psubusw (%rsi), %mm0
; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT:    pmuludq %xmm1, %xmm0
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-NEXT:    pmuludq %xmm2, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    pmulhw (%rsi), %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    pmaddwd (%rsi), %mm0
; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    pand %xmm0, %xmm1
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-NEXT:    por %xmm1, %xmm0
; X64-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    pxor %xmm0, %xmm1
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %tmp1 = load x86_mmx, x86_mmx* %A
  %tmp3 = load x86_mmx, x86_mmx* %B
  %tmp1a = bitcast x86_mmx %tmp1 to <4 x i16>
  %tmp3a = bitcast x86_mmx %tmp3 to <4 x i16>
  %tmp4 = add <4 x i16> %tmp1a, %tmp3a
  %tmp4a = bitcast <4 x i16> %tmp4 to x86_mmx
  store x86_mmx %tmp4a, x86_mmx* %A
  %tmp7 = load x86_mmx, x86_mmx* %B
  %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %tmp4a, x86_mmx %tmp7)
  store x86_mmx %tmp12, x86_mmx* %A
  %tmp16 = load x86_mmx, x86_mmx* %B
  %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp12, x86_mmx %tmp16)
  store x86_mmx %tmp21, x86_mmx* %A
  %tmp27 = load x86_mmx, x86_mmx* %B
  %tmp21a = bitcast x86_mmx %tmp21 to <4 x i16>
  %tmp27a = bitcast x86_mmx %tmp27 to <4 x i16>
  %tmp28 = sub <4 x i16> %tmp21a, %tmp27a
  %tmp28a = bitcast <4 x i16> %tmp28 to x86_mmx
  store x86_mmx %tmp28a, x86_mmx* %A
  %tmp31 = load x86_mmx, x86_mmx* %B
  %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %tmp28a, x86_mmx %tmp31)
  store x86_mmx %tmp36, x86_mmx* %A
  %tmp40 = load x86_mmx, x86_mmx* %B
  %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %tmp36, x86_mmx %tmp40)
  store x86_mmx %tmp45, x86_mmx* %A
  %tmp51 = load x86_mmx, x86_mmx* %B
  %tmp45a = bitcast x86_mmx %tmp45 to <4 x i16>
  %tmp51a = bitcast x86_mmx %tmp51 to <4 x i16>
  %tmp52 = mul <4 x i16> %tmp45a, %tmp51a
  %tmp52a = bitcast <4 x i16> %tmp52 to x86_mmx
  store x86_mmx %tmp52a, x86_mmx* %A
  %tmp55 = load x86_mmx, x86_mmx* %B
  %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %tmp52a, x86_mmx %tmp55)
  store x86_mmx %tmp60, x86_mmx* %A
  %tmp64 = load x86_mmx, x86_mmx* %B
  %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %tmp60, x86_mmx %tmp64)
  %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx
  store x86_mmx %tmp70, x86_mmx* %A
  %tmp75 = load x86_mmx, x86_mmx* %B
  %tmp70a = bitcast x86_mmx %tmp70 to <4 x i16>
  %tmp75a = bitcast x86_mmx %tmp75 to <4 x i16>
  %tmp76 = and <4 x i16> %tmp70a, %tmp75a
  %tmp76a = bitcast <4 x i16> %tmp76 to x86_mmx
  store x86_mmx %tmp76a, x86_mmx* %A
  %tmp81 = load x86_mmx, x86_mmx* %B
  %tmp76b = bitcast x86_mmx %tmp76a to <4 x i16>
  %tmp81a = bitcast x86_mmx %tmp81 to <4 x i16>
  %tmp82 = or <4 x i16> %tmp76b, %tmp81a
  %tmp82a = bitcast <4 x i16> %tmp82 to x86_mmx
  store x86_mmx %tmp82a, x86_mmx* %A
  %tmp87 = load x86_mmx, x86_mmx* %B
  %tmp82b = bitcast x86_mmx %tmp82a to <4 x i16>
  %tmp87a = bitcast x86_mmx %tmp87 to <4 x i16>
  %tmp88 = xor <4 x i16> %tmp82b, %tmp87a
  %tmp88a = bitcast <4 x i16> %tmp88 to x86_mmx
  store x86_mmx %tmp88a, x86_mmx* %A
  tail call void @llvm.x86.mmx.emms( )
  ret void
}

define <1 x i64> @test3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind {
; X32-LABEL: test3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    pushl %ebx
; X32-NEXT:    pushl %edi
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-8, %esp
; X32-NEXT:    subl $16, %esp
; X32-NEXT:    cmpl $0, 16(%ebp)
; X32-NEXT:    je .LBB3_1
; X32-NEXT:  # %bb.2: # %bb26.preheader
; X32-NEXT:    xorl %ebx, %ebx
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    xorl %edx, %edx
; X32-NEXT:    .p2align 4, 0x90
; X32-NEXT:  .LBB3_3: # %bb26
; X32-NEXT:    # =>This Inner Loop Header: Depth=1
; X32-NEXT:    movl 8(%ebp), %ecx
; X32-NEXT:    movl %ecx, %esi
; X32-NEXT:    movl (%ecx,%ebx,8), %ecx
; X32-NEXT:    movl 4(%esi,%ebx,8), %esi
; X32-NEXT:    movl 12(%ebp), %edi
; X32-NEXT:    addl (%edi,%ebx,8), %ecx
; X32-NEXT:    adcl 4(%edi,%ebx,8), %esi
; X32-NEXT:    addl %eax, %ecx
; X32-NEXT:    movl %ecx, (%esp)
; X32-NEXT:    adcl %edx, %esi
; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    movd %xmm0, %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; X32-NEXT:    movd %xmm0, %edx
; X32-NEXT:    incl %ebx
; X32-NEXT:    cmpl 16(%ebp), %ebx
; X32-NEXT:    jb .LBB3_3
; X32-NEXT:    jmp .LBB3_4
; X32-NEXT:  .LBB3_1:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    xorl %edx, %edx
; X32-NEXT:  .LBB3_4: # %bb31
; X32-NEXT:    leal -12(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %edi
; X32-NEXT:    popl %ebx
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    xorl %r8d, %r8d
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    testl %edx, %edx
; X64-NEXT:    je .LBB3_2
; X64-NEXT:    .p2align 4, 0x90
; X64-NEXT:  .LBB3_1: # %bb26
; X64-NEXT:    # =>This Inner Loop Header: Depth=1
; X64-NEXT:    movslq %r8d, %r8
; X64-NEXT:    movq (%rdi,%r8,8), %rcx
; X64-NEXT:    addq (%rsi,%r8,8), %rcx
; X64-NEXT:    addq %rcx, %rax
; X64-NEXT:    incl %r8d
; X64-NEXT:    cmpl %edx, %r8d
; X64-NEXT:    jb .LBB3_1
; X64-NEXT:  .LBB3_2: # %bb31
; X64-NEXT:    retq
entry:
  %tmp2942 = icmp eq i32 %count, 0
  br i1 %tmp2942, label %bb31, label %bb26

bb26:
  %i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ]
  %sum.035.0 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ]
  %tmp13 = getelementptr <1 x i64>, <1 x i64>* %b, i32 %i.037.0
  %tmp14 = load <1 x i64>, <1 x i64>* %tmp13
  %tmp18 = getelementptr <1 x i64>, <1 x i64>* %a, i32 %i.037.0
  %tmp19 = load <1 x i64>, <1 x i64>* %tmp18
  %tmp21 = add <1 x i64> %tmp19, %tmp14
  %tmp22 = add <1 x i64> %tmp21, %sum.035.0
  %tmp25 = add i32 %i.037.0, 1
  %tmp29 = icmp ult i32 %tmp25, %count
  br i1 %tmp29, label %bb26, label %bb31

bb31:
  %sum.035.1 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ]
  ret <1 x i64> %sum.035.1
}

; There are no MMX operations here, so we use XMM or i64.
define void @ti8(double %a, double %b) nounwind {
; X32-LABEL: ti8:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    paddb %xmm0, %xmm1
; X32-NEXT:    movq %xmm1, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    paddb %xmm1, %xmm0
; X64-NEXT:    movq %xmm0, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to <8 x i8>
  %tmp2 = bitcast double %b to <8 x i8>
  %tmp3 = add <8 x i8> %tmp1, %tmp2
  store <8 x i8> %tmp3, <8 x i8>* null
  ret void
}

define void @ti16(double %a, double %b) nounwind {
; X32-LABEL: ti16:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    paddw %xmm0, %xmm1
; X32-NEXT:    movq %xmm1, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    paddw %xmm1, %xmm0
; X64-NEXT:    movq %xmm0, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to <4 x i16>
  %tmp2 = bitcast double %b to <4 x i16>
  %tmp3 = add <4 x i16> %tmp1, %tmp2
  store <4 x i16> %tmp3, <4 x i16>* null
  ret void
}

define void @ti32(double %a, double %b) nounwind {
; X32-LABEL: ti32:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    paddd %xmm0, %xmm1
; X32-NEXT:    movq %xmm1, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    paddd %xmm1, %xmm0
; X64-NEXT:    movq %xmm0, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to <2 x i32>
  %tmp2 = bitcast double %b to <2 x i32>
  %tmp3 = add <2 x i32> %tmp1, %tmp2
  store <2 x i32> %tmp3, <2 x i32>* null
  ret void
}

define void @ti64(double %a, double %b) nounwind {
; X32-LABEL: ti64:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    addl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl %eax, 0
; X32-NEXT:    movl %ecx, 4
; X32-NEXT:    retl
;
; X64-LABEL: ti64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %xmm0, %rax
; X64-NEXT:    movq %xmm1, %rcx
; X64-NEXT:    addq %rax, %rcx
; X64-NEXT:    movq %rcx, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to <1 x i64>
  %tmp2 = bitcast double %b to <1 x i64>
  %tmp3 = add <1 x i64> %tmp1, %tmp2
  store <1 x i64> %tmp3, <1 x i64>* null
  ret void
}

; MMX intrinsic calls get us MMX instructions.
define void @ti8a(double %a, double %b) nounwind {
; X32-LABEL: ti8a:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    paddb {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %mm0, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti8a:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdq2q %xmm0, %mm0
; X64-NEXT:    movdq2q %xmm1, %mm1
; X64-NEXT:    paddb %mm0, %mm1
; X64-NEXT:    movq %mm1, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to x86_mmx
  %tmp2 = bitcast double %b to x86_mmx
  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %tmp1, x86_mmx %tmp2)
  store x86_mmx %tmp3, x86_mmx* null
  ret void
}

define void @ti16a(double %a, double %b) nounwind {
; X32-LABEL: ti16a:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    paddw {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %mm0, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti16a:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdq2q %xmm0, %mm0
; X64-NEXT:    movdq2q %xmm1, %mm1
; X64-NEXT:    paddw %mm0, %mm1
; X64-NEXT:    movq %mm1, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to x86_mmx
  %tmp2 = bitcast double %b to x86_mmx
  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %tmp1, x86_mmx %tmp2)
  store x86_mmx %tmp3, x86_mmx* null
  ret void
}

define void @ti32a(double %a, double %b) nounwind {
; X32-LABEL: ti32a:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    paddd {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %mm0, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti32a:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdq2q %xmm0, %mm0
; X64-NEXT:    movdq2q %xmm1, %mm1
; X64-NEXT:    paddd %mm0, %mm1
; X64-NEXT:    movq %mm1, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to x86_mmx
  %tmp2 = bitcast double %b to x86_mmx
  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %tmp1, x86_mmx %tmp2)
  store x86_mmx %tmp3, x86_mmx* null
  ret void
}

define void @ti64a(double %a, double %b) nounwind {
; X32-LABEL: ti64a:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    paddq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %mm0, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti64a:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdq2q %xmm0, %mm0
; X64-NEXT:    movdq2q %xmm1, %mm1
; X64-NEXT:    paddq %mm0, %mm1
; X64-NEXT:    movq %mm1, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to x86_mmx
  %tmp2 = bitcast double %b to x86_mmx
  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %tmp1, x86_mmx %tmp2)
  store x86_mmx %tmp3, x86_mmx* null
  ret void
}

declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)

declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx)

declare void @llvm.x86.mmx.emms()

declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx)