; Tests for SSE2 and below, without SSE3+.
; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s

; test1: loads a <2 x double> from %A, replaces element 0 with the scalar %B
; (shuffle mask <2, 1>) and stores the result to %r. The expected sequence
; folds the scalar in with a single movlpd from the stack.
define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; CHECK-LABEL: test1:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movapd (%ecx), %xmm0
; CHECK-NEXT: movlpd {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT: movapd %xmm0, (%eax)
; CHECK-NEXT: retl
  %tmp3 = load <2 x double>, <2 x double>* %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
  store <2 x double> %tmp9, <2 x double>* %r, align 16
  ret void
}

; test2: same shape as test1, but the scalar %B replaces element 1 instead
; (shuffle mask <0, 2>). The expected sequence uses movhpd rather than movlpd.
define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; CHECK-LABEL: test2:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movapd (%ecx), %xmm0
; CHECK-NEXT: movhpd {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT: movapd %xmm0, (%eax)
; CHECK-NEXT: retl
  %tmp3 = load <2 x double>, <2 x double>* %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
  store <2 x double> %tmp9, <2 x double>* %r, align 16
  ret void
}


; test3: builds <A[0], B[0], A[1], B[1]> from the vectors loaded from %A and %B
; using an extractelement/insertelement chain. The expected sequence
; recognises this as a single unpcklps with a memory operand.
define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: test3:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movaps (%edx), %xmm0
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT: movaps %xmm0, (%eax)
; CHECK-NEXT: retl
  %tmp = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=2]
  %tmp3 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=2]
  %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; <float> [#uses=1]
  %tmp7 = extractelement <4 x float> %tmp, i32 0 ; <float> [#uses=1]
  %tmp8 = extractelement <4 x float> %tmp3, i32 1 ; <float> [#uses=1]
  %tmp9 = extractelement <4 x float> %tmp, i32 1 ; <float> [#uses=1]
  %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0 ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1 ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2 ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp13, <4 x float>* %res
  ret void
}

; test4: shuffle of %X with an undef second operand using mask <2, 6, 3, 7>.
; Mask indices 6 and 7 select from the undef operand, so only lanes 0 and 2 of
; the result are constrained. The expected sequence is a single in-register
; shufps.
define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
; CHECK-LABEL: test4:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; CHECK-NEXT: movaps %xmm0, (%eax)
; CHECK-NEXT: retl
  %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp5, <4 x float>* %res
  ret void
}

; test5: loads one float through the pointer stored at %ptr and builds
; <f, 0.0, 0.0, 0.0>. It then interleaves the bytes of that vector with zero
; bytes, and interleaves zero words with the resulting words. The expected
; sequence is a zero-extending movss load followed by punpcklbw and punpcklwd
; against a zeroed register.
define <4 x i32> @test5(i8** %ptr) nounwind {
; CHECK-LABEL: test5:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl (%eax), %eax
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: retl
  %tmp = load i8*, i8** %ptr ; <i8*> [#uses=1]
  %tmp.upgrd.1 = bitcast i8* %tmp to float* ; <float*> [#uses=1]
  %tmp.upgrd.2 = load float, float* %tmp.upgrd.1 ; <float> [#uses=1]
  %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0 ; <<4 x float>> [#uses=1]
  %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
  %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8> ; <<16 x i8>> [#uses=1]
  %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 > ; <<16 x i8>> [#uses=1]
  %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16> ; <<8 x i16>> [#uses=1]
  %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 > ; <<8 x i16>> [#uses=1]
  %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32> ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %tmp36
}

; test6: shuffle with mask <0, 5, 6, 7> whose second operand is undef, so only
; lane 0 of the result is constrained. The expected sequence contains no
; shuffle instruction at all, just an aligned load and store.
define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
; CHECK-LABEL: test6:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movaps (%ecx), %xmm0
; CHECK-NEXT: movaps %xmm0, (%eax)
; CHECK-NEXT: retl
  %tmp1 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=1]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp2, <4 x float>* %res
  ret void
}

; test7: splats element 0 of an all-zero vector (mask zeroinitializer) and
; stores it to the null address. The expected sequence materialises zero with
; xorps and stores it to absolute address 0.
; The two instructions below are unnamed and are referenced as %1 and %2.
define void @test7() nounwind {
; CHECK-LABEL: test7:
; CHECK: ## BB#0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: movaps %xmm0, 0
; CHECK-NEXT: retl
  bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1]
  shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1]
  store <4 x float> %2, <4 x float>* null
  ret void
}

; External array read element by element in test8.
@x = external global [4 x i32]

; test8: loads the four i32 elements of @x individually and inserts them into
; lanes 0..3 of a vector. The expected sequence merges the four scalar loads
; into one unaligned 16-byte load (movups) through the non-lazy pointer.
define <2 x i64> @test8() nounwind {
; CHECK-LABEL: test8:
; CHECK: ## BB#0:
; CHECK-NEXT: movl L_x$non_lazy_ptr, %eax
; CHECK-NEXT: movups (%eax), %xmm0
; CHECK-NEXT: retl
  %tmp = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 0) ; <i32> [#uses=1]
  %tmp3 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 1) ; <i32> [#uses=1]
  %tmp5 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 2) ; <i32> [#uses=1]
  %tmp7 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 3) ; <i32> [#uses=1]
  %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 ; <<4 x i32>> [#uses=1]
  %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1 ; <<4 x i32>> [#uses=1]
  %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2 ; <<4 x i32>> [#uses=1]
  %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3 ; <<4 x i32>> [#uses=1]
  %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %tmp16
}

; test9: builds a <4 x float> from four float arguments that follow an i32
; %dummy argument. The expected sequence is one unaligned vector load (movups)
; straight from the argument area.
; NOTE(review): presumably %dummy pushes the floats off 16-byte alignment,
; which is why this is movups while test10 is movaps. Confirm against the
; Darwin i386 calling convention.
define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
; CHECK-LABEL: test9:
; CHECK: ## BB#0:
; CHECK-NEXT: movups {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT: retl
  %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp13
}

; test10: same build-vector as test9 but without the leading i32 argument.
; The expected sequence uses an aligned vector load (movaps) from the stack.
define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
; CHECK-LABEL: test10:
; CHECK: ## BB#0:
; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT: retl
  %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp13
}

; test11: builds a <2 x double> from two double arguments. The expected
; sequence is a single aligned 16-byte load from the stack.
define <2 x double> @test11(double %a, double %b) nounwind {
; CHECK-LABEL: test11:
; CHECK: ## BB#0:
; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT: retl
  %tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1]
  %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1]
  ret <2 x double> %tmp7
}

; test12: loads a <4 x float> from the null address and forms two shuffles:
;   %tmp2 = low half of the load, high half of a constant all-1.0 vector
;           (mask <0, 1, 6, 7>)
;   %tmp3 = high half of the load, high half of a zero vector
;           (mask <2, 3, 6, 7>)
; It adds them and stores the sum back to null. The expected sequence performs
; both half-vector shuffles with 64-bit-lane instructions (movsd, unpckhpd).
define void @test12() nounwind {
; CHECK-LABEL: test12:
; CHECK: ## BB#0:
; CHECK-NEXT: movapd 0, %xmm0
; CHECK-NEXT: movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; CHECK-NEXT: xorpd %xmm2, %xmm2
; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; CHECK-NEXT: addps %xmm1, %xmm0
; CHECK-NEXT: movaps %xmm0, 0
; CHECK-NEXT: retl
  %tmp1 = load <4 x float>, <4 x float>* null ; <<4 x float>> [#uses=2]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
  %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp4, <4 x float>* null
  ret void
}

; test13: shuffles the vectors loaded from %B and %C with mask <1, 4, 1, 5>,
; giving <B[1], C[0], B[1], C[1]>, and stores the result to %res. The expected
; sequence needs two shufps instructions. The %A argument is not used by the
; body.
define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
; CHECK-LABEL: test13:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movaps (%edx), %xmm0
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; CHECK-NEXT: movaps %xmm0, (%eax)
; CHECK-NEXT: retl
  %tmp3 = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=1]
  %tmp5 = load <4 x float>, <4 x float>* %C ; <<4 x float>> [#uses=1]
  %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp11, <4 x float>* %res
  ret void
}

; test14: computes x+y and x-y on the loaded vectors, then concatenates the low
; halves of the sum and the difference (mask <0, 1, 4, 5>). The expected
; sequence does the concatenation with a single unpcklpd.
define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
; CHECK-LABEL: test14:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movaps (%ecx), %xmm1
; CHECK-NEXT: movaps (%eax), %xmm2
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: addps %xmm1, %xmm0
; CHECK-NEXT: subps %xmm1, %xmm2
; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; CHECK-NEXT: retl
  %tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=2]
  %tmp5 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=2]
  %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
  %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
  %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp27
}

; test15: concatenates the high halves of the vectors loaded from %x and %y
; (mask <2, 3, 6, 7>). The expected sequence is one load plus unpckhpd with a
; memory operand.
define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
; CHECK-LABEL: test15:
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movapd (%ecx), %xmm0
; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; CHECK-NEXT: retl
entry:
  %tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=1]
  %tmp3 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=1]
  %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp4
}

; PR8900

; test16: loads the <4 x double> at index 3 from %srcA (byte offset 3 * 32 = 96)
; and extracts elements 0 and 2 into a <2 x double>. The expected sequence
; loads one 16-byte half at offset 96 and merges element 2 with unpcklpd from
; memory. The %dst argument is not used by the body.
define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
; CHECK-LABEL: test16:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movapd 96(%eax), %xmm0
; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: retl
  %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3
  %i6 = load <4 x double>, <4 x double>* %i5, align 32
  %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %i7
}

; PR9009
; test17: shuffles the constant <undef, undef, 32768, 32768> with a vector
; whose only inserted element is itself undef (mask <4, 5, 2, 3>), then stores
; the result through an undef pointer. The expected sequence is a constant-pool
; load of <u,u,32768,32768> followed by a store.
define fastcc void @test17() nounwind {
; CHECK-LABEL: test17:
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: movaps {{.*#+}} xmm0 = <u,u,32768,32768>
; CHECK-NEXT: movaps %xmm0, (%eax)
; CHECK-NEXT: retl
entry:
  %0 = insertelement <4 x i32> undef, i32 undef, i32 1
  %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  %2 = bitcast <4 x i32> %1 to <4 x float>
  store <4 x float> %2, <4 x float> * undef
  ret void
}

; PR9210
; f: truncates a <4 x double> argument to <4 x float>. The expected sequence
; converts each 128-bit half with cvtpd2ps and joins the two results with
; unpcklpd.
define <4 x float> @f(<4 x double>) nounwind {
; CHECK-LABEL: f:
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1
; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0
; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retl
entry:
  %double2float.i = fptrunc <4 x double> %0 to <4 x float>
  ret <4 x float> %double2float.i
}

; test_insert_64_zext: keeps element 0 of %i and takes element 1 from the
; constant <0, undef> (mask <0, 2>), so the high 64-bit element becomes zero.
; The expected sequence is a single register-to-register movq.
define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
; CHECK-LABEL: test_insert_64_zext:
; CHECK: ## BB#0:
; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: retl
  %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %1
}

; PR19721: bitcasts the vector to i128, ANDs it with -4294967296 (which clears
; the low 32 bits) and bitcasts back. The expected sequence keeps the value in
; the vector register and applies one andps with a constant-pool mask.
define <4 x i32> @PR19721(<4 x i32> %i) {
; CHECK-LABEL: PR19721:
; CHECK: ## BB#0:
; CHECK-NEXT: andps LCPI19_0, %xmm0
; CHECK-NEXT: retl
  %bc = bitcast <4 x i32> %i to i128
  %insert = and i128 %bc, -4294967296
  %bc2 = bitcast i128 %insert to <4 x i32>
  ret <4 x i32> %bc2
}

; test_mul: element-wise <4 x i32> multiply. The expected sequence uses two
; pmuludq instructions: one on the original operands, and one on copies
; shuffled with pshufd [1,1,3,3] to bring lanes 1 and 3 into position. The two
; products are then recombined with pshufd [0,2,2,3] and punpckldq.
define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: test_mul:
; CHECK: ## BB#0:
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-NEXT: pmuludq %xmm1, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-NEXT: pmuludq %xmm2, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: retl
  %m = mul <4 x i32> %x, %y
  ret <4 x i32> %m
}