; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 | FileCheck -check-prefix=X32 %s
; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck -check-prefix=X64 %s

;; A basic sanity check to make sure that MMX arithmetic actually compiles.
;; First is a straight translation of the original with bitcasts as needed.

; X32-LABEL: test0
; X64-LABEL: test0
define void @test0(x86_mmx* %A, x86_mmx* %B) {
entry:
  %tmp1 = load x86_mmx, x86_mmx* %A
  %tmp3 = load x86_mmx, x86_mmx* %B
  %tmp1a = bitcast x86_mmx %tmp1 to <8 x i8>
  %tmp3a = bitcast x86_mmx %tmp3 to <8 x i8>
  %tmp4 = add <8 x i8> %tmp1a, %tmp3a
  %tmp4a = bitcast <8 x i8> %tmp4 to x86_mmx
  store x86_mmx %tmp4a, x86_mmx* %A
  %tmp7 = load x86_mmx, x86_mmx* %B
  %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %tmp4a, x86_mmx %tmp7)
  store x86_mmx %tmp12, x86_mmx* %A
  %tmp16 = load x86_mmx, x86_mmx* %B
  %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %tmp12, x86_mmx %tmp16)
  store x86_mmx %tmp21, x86_mmx* %A
  %tmp27 = load x86_mmx, x86_mmx* %B
  %tmp21a = bitcast x86_mmx %tmp21 to <8 x i8>
  %tmp27a = bitcast x86_mmx %tmp27 to <8 x i8>
  %tmp28 = sub <8 x i8> %tmp21a, %tmp27a
  %tmp28a = bitcast <8 x i8> %tmp28 to x86_mmx
  store x86_mmx %tmp28a, x86_mmx* %A
  %tmp31 = load x86_mmx, x86_mmx* %B
  %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %tmp28a, x86_mmx %tmp31)
  store x86_mmx %tmp36, x86_mmx* %A
  %tmp40 = load x86_mmx, x86_mmx* %B
  %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %tmp36, x86_mmx %tmp40)
  store x86_mmx %tmp45, x86_mmx* %A
  %tmp51 = load x86_mmx, x86_mmx* %B
  %tmp45a = bitcast x86_mmx %tmp45 to <8 x i8>
  %tmp51a = bitcast x86_mmx %tmp51 to <8 x i8>
  %tmp52 = mul <8 x i8> %tmp45a, %tmp51a
  %tmp52a = bitcast <8 x i8> %tmp52 to x86_mmx
  store x86_mmx %tmp52a, x86_mmx* %A
  %tmp57 = load x86_mmx, x86_mmx* %B
  %tmp57a = bitcast x86_mmx %tmp57 to <8 x i8>
  %tmp58 = and <8 x i8> %tmp52, %tmp57a
  %tmp58a = bitcast <8 x i8> %tmp58 to x86_mmx
  store x86_mmx %tmp58a, x86_mmx* %A
  %tmp63 = load x86_mmx, x86_mmx* %B
  %tmp63a = bitcast x86_mmx %tmp63 to <8 x i8>
  %tmp64 = or <8 x i8> %tmp58, %tmp63a
  %tmp64a = bitcast <8 x i8> %tmp64 to x86_mmx
  store x86_mmx %tmp64a, x86_mmx* %A
  %tmp69 = load x86_mmx, x86_mmx* %B
  %tmp69a = bitcast x86_mmx %tmp69 to <8 x i8>
  %tmp64b = bitcast x86_mmx %tmp64a to <8 x i8>
  %tmp70 = xor <8 x i8> %tmp64b, %tmp69a
  %tmp70a = bitcast <8 x i8> %tmp70 to x86_mmx
  store x86_mmx %tmp70a, x86_mmx* %A
  tail call void @llvm.x86.mmx.emms()
  ret void
}

; X32-LABEL: test1
; X64-LABEL: test1
define void @test1(x86_mmx* %A, x86_mmx* %B) {
entry:
  %tmp1 = load x86_mmx, x86_mmx* %A
  %tmp3 = load x86_mmx, x86_mmx* %B
  %tmp1a = bitcast x86_mmx %tmp1 to <2 x i32>
  %tmp3a = bitcast x86_mmx %tmp3 to <2 x i32>
  %tmp4 = add <2 x i32> %tmp1a, %tmp3a
  %tmp4a = bitcast <2 x i32> %tmp4 to x86_mmx
  store x86_mmx %tmp4a, x86_mmx* %A
  %tmp9 = load x86_mmx, x86_mmx* %B
  %tmp9a = bitcast x86_mmx %tmp9 to <2 x i32>
  %tmp10 = sub <2 x i32> %tmp4, %tmp9a
  %tmp10a = bitcast <2 x i32> %tmp10 to x86_mmx
  store x86_mmx %tmp10a, x86_mmx* %A
  %tmp15 = load x86_mmx, x86_mmx* %B
  %tmp10b = bitcast x86_mmx %tmp10a to <2 x i32>
  %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32>
  %tmp16 = mul <2 x i32> %tmp10b, %tmp15a
  %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx
  store x86_mmx %tmp16a, x86_mmx* %A
  %tmp21 = load x86_mmx, x86_mmx* %B
  %tmp16b = bitcast x86_mmx %tmp16a to <2 x i32>
  %tmp21a = bitcast x86_mmx %tmp21 to <2 x i32>
  %tmp22 = and <2 x i32> %tmp16b, %tmp21a
  %tmp22a = bitcast <2 x i32> %tmp22 to x86_mmx
  store x86_mmx %tmp22a, x86_mmx* %A
  %tmp27 = load x86_mmx, x86_mmx* %B
  %tmp22b = bitcast x86_mmx %tmp22a to <2 x i32>
  %tmp27a = bitcast x86_mmx %tmp27 to <2 x i32>
  %tmp28 = or <2 x i32> %tmp22b, %tmp27a
  %tmp28a = bitcast <2 x i32> %tmp28 to x86_mmx
  store x86_mmx %tmp28a, x86_mmx* %A
  %tmp33 = load x86_mmx, x86_mmx* %B
  %tmp28b = bitcast x86_mmx %tmp28a to <2 x i32>
  %tmp33a = bitcast x86_mmx %tmp33 to <2 x i32>
  %tmp34 = xor <2 x i32> %tmp28b, %tmp33a
  %tmp34a = bitcast <2 x i32> %tmp34 to x86_mmx
  store x86_mmx %tmp34a, x86_mmx* %A
  tail call void @llvm.x86.mmx.emms()
  ret void
}

; X32-LABEL: test2
; X64-LABEL: test2
define void @test2(x86_mmx* %A, x86_mmx* %B) {
entry:
  %tmp1 = load x86_mmx, x86_mmx* %A
  %tmp3 = load x86_mmx, x86_mmx* %B
  %tmp1a = bitcast x86_mmx %tmp1 to <4 x i16>
  %tmp3a = bitcast x86_mmx %tmp3 to <4 x i16>
  %tmp4 = add <4 x i16> %tmp1a, %tmp3a
  %tmp4a = bitcast <4 x i16> %tmp4 to x86_mmx
  store x86_mmx %tmp4a, x86_mmx* %A
  %tmp7 = load x86_mmx, x86_mmx* %B
  %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %tmp4a, x86_mmx %tmp7)
  store x86_mmx %tmp12, x86_mmx* %A
  %tmp16 = load x86_mmx, x86_mmx* %B
  %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp12, x86_mmx %tmp16)
  store x86_mmx %tmp21, x86_mmx* %A
  %tmp27 = load x86_mmx, x86_mmx* %B
  %tmp21a = bitcast x86_mmx %tmp21 to <4 x i16>
  %tmp27a = bitcast x86_mmx %tmp27 to <4 x i16>
  %tmp28 = sub <4 x i16> %tmp21a, %tmp27a
  %tmp28a = bitcast <4 x i16> %tmp28 to x86_mmx
  store x86_mmx %tmp28a, x86_mmx* %A
  %tmp31 = load x86_mmx, x86_mmx* %B
  %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %tmp28a, x86_mmx %tmp31)
  store x86_mmx %tmp36, x86_mmx* %A
  %tmp40 = load x86_mmx, x86_mmx* %B
  %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %tmp36, x86_mmx %tmp40)
  store x86_mmx %tmp45, x86_mmx* %A
  %tmp51 = load x86_mmx, x86_mmx* %B
  %tmp45a = bitcast x86_mmx %tmp45 to <4 x i16>
  %tmp51a = bitcast x86_mmx %tmp51 to <4 x i16>
  %tmp52 = mul <4 x i16> %tmp45a, %tmp51a
  %tmp52a = bitcast <4 x i16> %tmp52 to x86_mmx
  store x86_mmx %tmp52a, x86_mmx* %A
  %tmp55 = load x86_mmx, x86_mmx* %B
  %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %tmp52a, x86_mmx %tmp55)
  store x86_mmx %tmp60, x86_mmx* %A
  %tmp64 = load x86_mmx, x86_mmx* %B
  %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %tmp60, x86_mmx %tmp64)
  %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx
  store x86_mmx %tmp70, x86_mmx* %A
  %tmp75 = load x86_mmx, x86_mmx* %B
  %tmp70a = bitcast x86_mmx %tmp70 to <4 x i16>
  %tmp75a = bitcast x86_mmx %tmp75 to <4 x i16>
  %tmp76 = and <4 x i16> %tmp70a, %tmp75a
  %tmp76a = bitcast <4 x i16> %tmp76 to x86_mmx
  store x86_mmx %tmp76a, x86_mmx* %A
  %tmp81 = load x86_mmx, x86_mmx* %B
  %tmp76b = bitcast x86_mmx %tmp76a to <4 x i16>
  %tmp81a = bitcast x86_mmx %tmp81 to <4 x i16>
  %tmp82 = or <4 x i16> %tmp76b, %tmp81a
  %tmp82a = bitcast <4 x i16> %tmp82 to x86_mmx
  store x86_mmx %tmp82a, x86_mmx* %A
  %tmp87 = load x86_mmx, x86_mmx* %B
  %tmp82b = bitcast x86_mmx %tmp82a to <4 x i16>
  %tmp87a = bitcast x86_mmx %tmp87 to <4 x i16>
  %tmp88 = xor <4 x i16> %tmp82b, %tmp87a
  %tmp88a = bitcast <4 x i16> %tmp88 to x86_mmx
  store x86_mmx %tmp88a, x86_mmx* %A
  tail call void @llvm.x86.mmx.emms()
  ret void
}

; X32-LABEL: test3
define <1 x i64> @test3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind {
entry:
  %tmp2942 = icmp eq i32 %count, 0
  br i1 %tmp2942, label %bb31, label %bb26

bb26:
; X32: addl
; X32: adcl
  %i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ]
  %sum.035.0 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ]
  %tmp13 = getelementptr <1 x i64>, <1 x i64>* %b, i32 %i.037.0
  %tmp14 = load <1 x i64>, <1 x i64>* %tmp13
  %tmp18 = getelementptr <1 x i64>, <1 x i64>* %a, i32 %i.037.0
  %tmp19 = load <1 x i64>, <1 x i64>* %tmp18
  %tmp21 = add <1 x i64> %tmp19, %tmp14
  %tmp22 = add <1 x i64> %tmp21, %sum.035.0
  %tmp25 = add i32 %i.037.0, 1
  %tmp29 = icmp ult i32 %tmp25, %count
  br i1 %tmp29, label %bb26, label %bb31

bb31:
  %sum.035.1 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ]
  ret <1 x i64> %sum.035.1
}

; There are no MMX operations here, so we use XMM or i64.
; X64-LABEL: ti8
define void @ti8(double %a, double %b) nounwind {
entry:
  %tmp1 = bitcast double %a to <8 x i8>
  %tmp2 = bitcast double %b to <8 x i8>
  %tmp3 = add <8 x i8> %tmp1, %tmp2
; X64: paddb
  store <8 x i8> %tmp3, <8 x i8>* null
  ret void
}

; X64-LABEL: ti16
define void @ti16(double %a, double %b) nounwind {
entry:
  %tmp1 = bitcast double %a to <4 x i16>
  %tmp2 = bitcast double %b to <4 x i16>
  %tmp3 = add <4 x i16> %tmp1, %tmp2
; X64: paddw
  store <4 x i16> %tmp3, <4 x i16>* null
  ret void
}

; X64-LABEL: ti32
define void @ti32(double %a, double %b) nounwind {
entry:
  %tmp1 = bitcast double %a to <2 x i32>
  %tmp2 = bitcast double %b to <2 x i32>
  %tmp3 = add <2 x i32> %tmp1, %tmp2
; X64: paddd
  store <2 x i32> %tmp3, <2 x i32>* null
  ret void
}

; X64-LABEL: ti64
define void @ti64(double %a, double %b) nounwind {
entry:
  %tmp1 = bitcast double %a to <1 x i64>
  %tmp2 = bitcast double %b to <1 x i64>
  %tmp3 = add <1 x i64> %tmp1, %tmp2
; X64: addq
  store <1 x i64> %tmp3, <1 x i64>* null
  ret void
}

; MMX intrinsic calls get us MMX instructions.
; X64-LABEL: ti8a
define void @ti8a(double %a, double %b) nounwind {
entry:
  %tmp1 = bitcast double %a to x86_mmx
; X64: movdq2q
  %tmp2 = bitcast double %b to x86_mmx
; X64: movdq2q
  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %tmp1, x86_mmx %tmp2)
  store x86_mmx %tmp3, x86_mmx* null
  ret void
}

; X64-LABEL: ti16a
define void @ti16a(double %a, double %b) nounwind {
entry:
  %tmp1 = bitcast double %a to x86_mmx
; X64: movdq2q
  %tmp2 = bitcast double %b to x86_mmx
; X64: movdq2q
  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %tmp1, x86_mmx %tmp2)
  store x86_mmx %tmp3, x86_mmx* null
  ret void
}

; X64-LABEL: ti32a
define void @ti32a(double %a, double %b) nounwind {
entry:
  %tmp1 = bitcast double %a to x86_mmx
; X64: movdq2q
  %tmp2 = bitcast double %b to x86_mmx
; X64: movdq2q
  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %tmp1, x86_mmx %tmp2)
  store x86_mmx %tmp3, x86_mmx* null
  ret void
}

; X64-LABEL: ti64a
define void @ti64a(double %a, double %b) nounwind {
entry:
  %tmp1 = bitcast double %a to x86_mmx
; X64: movdq2q
  %tmp2 = bitcast double %b to x86_mmx
; X64: movdq2q
  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %tmp1, x86_mmx %tmp2)
  store x86_mmx %tmp3, x86_mmx* null
  ret void
}

declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)

declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx)

declare void @llvm.x86.mmx.emms()

declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx)