; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64

; NOTE(review): the X32/X64 CHECK lines below follow the auto-generated
; update_llc_test_checks.py format — presumably regenerate rather than
; hand-edit them if codegen changes. TODO confirm against the script header.

@g16 = external global i16

; insertelement of a GPR value into lane 1 should select pinsrd.
define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
; X32-LABEL: pinsrd_1:
; X32: ## BB#0:
; X32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: pinsrd_1:
; X64: ## BB#0:
; X64-NEXT: pinsrd $1, %edi, %xmm0
; X64-NEXT: retq
  %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
  ret <4 x i32> %tmp1
}

; Same as pinsrd_1 but for a byte element: expects pinsrb.
define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
; X32-LABEL: pinsrb_1:
; X32: ## BB#0:
; X32-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: pinsrb_1:
; X64: ## BB#0:
; X64-NEXT: pinsrb $1, %edi, %xmm0
; X64-NEXT: retq
  %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
  ret <16 x i8> %tmp1
}

define <2 x i64> @pmovzxbq_1() nounwind {
; X32-LABEL: pmovzxbq_1:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl L_g16$non_lazy_ptr, %eax
; X32-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: pmovzxbq_1:
; X64: ## BB#0: ## %entry
; X64-NEXT: movq _g16@{{.*}}(%rip), %rax
; X64-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
entry:
  %0 = load i16, i16* @g16, align 2 ; <i16> [#uses=1]
  %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]
  %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]
  %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %3
}

declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone

define i32 @extractps_1(<4 x float> %v) nounwind {
; X32-LABEL: extractps_1:
; X32: ## BB#0:
; X32-NEXT: extractps $3, %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: extractps_1:
; X64: ## BB#0:
; X64-NEXT: extractps $3, %xmm0, %eax
; X64-NEXT: retq
  %s = extractelement <4 x float> %v, i32 3
  %i = bitcast float %s to i32
  ret i32 %i
}
define i32 @extractps_2(<4 x float> %v) nounwind {
; X32-LABEL: extractps_2:
; X32: ## BB#0:
; X32-NEXT: extractps $3, %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: extractps_2:
; X64: ## BB#0:
; X64-NEXT: extractps $3, %xmm0, %eax
; X64-NEXT: retq
  %t = bitcast <4 x float> %v to <4 x i32>
  %s = extractelement <4 x i32> %t, i32 3
  ret i32 %s
}


; The non-store form of extractps puts its result into a GPR.
; This makes it suitable for an extract from a <4 x float> that
; is bitcasted to i32, but unsuitable for much of anything else.

define float @ext_1(<4 x float> %v) nounwind {
; X32-LABEL: ext_1:
; X32: ## BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT: addss LCPI5_0, %xmm0
; X32-NEXT: movss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: ext_1:
; X64: ## BB#0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT: addss {{.*}}(%rip), %xmm0
; X64-NEXT: retq
  %s = extractelement <4 x float> %v, i32 3
  %t = fadd float %s, 1.0
  ret float %t
}
define float @ext_2(<4 x float> %v) nounwind {
; X32-LABEL: ext_2:
; X32: ## BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT: movss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: ext_2:
; X64: ## BB#0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT: retq
  %s = extractelement <4 x float> %v, i32 3
  ret float %s
}
define i32 @ext_3(<4 x i32> %v) nounwind {
; X32-LABEL: ext_3:
; X32: ## BB#0:
; X32-NEXT: pextrd $3, %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: ext_3:
; X64: ## BB#0:
; X64-NEXT: pextrd $3, %xmm0, %eax
; X64-NEXT: retq
  %i = extractelement <4 x i32> %v, i32 3
  ret i32 %i
}

define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: insertps_1:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_1:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
; X64-NEXT: retq
  %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
  ret <4 x float> %tmp1
}

declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone

; When optimizing for speed, prefer blendps over insertps even if it means we have to
; generate a separate movss to load the scalar operand.
define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
; X32-LABEL: blendps_not_insertps_1:
; X32: ## BB#0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: blendps_not_insertps_1:
; X64: ## BB#0:
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; When optimizing for size, generate an insertps if there's a load fold opportunity.
; The difference between i386 and x86-64 ABIs for the float operand means we should
; generate an insertps for X32 but not for X64!
; minsize variant: X32 folds the stack load into insertps; X64 gets the float
; in %xmm1 so a blendps suffices.
define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
; X32-LABEL: insertps_or_blendps:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_or_blendps:
; X64: ## BB#0:
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; An insert into the low 32-bits of a vector from the low 32-bits of another vector
; is always just a blendps because blendps is never more expensive than insertps.
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: blendps_not_insertps_2:
; X32: ## BB#0:
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: blendps_not_insertps_2:
; X64: ## BB#0:
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
  %tmp2 = extractelement <4 x float> %t2, i32 0
  %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
  ret <4 x float> %tmp1
}

; ptest ZF intrinsic: materialized via ptest + sete.
define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_1:
; X32: ## BB#0:
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: sete %al
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: retl
;
; X64-LABEL: ptestz_1:
; X64: ## BB#0:
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: sete %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; ptest CF intrinsic: materialized via ptest + sbbl/andl.
define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_2:
; X32: ## BB#0:
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: sbbl %eax, %eax
; X32-NEXT: andl $1, %eax
; X32-NEXT: retl
;
; X64-LABEL: ptestz_2:
; X64: ## BB#0:
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: sbbl %eax, %eax
; X64-NEXT: andl $1, %eax
; X64-NEXT: retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; ptest "not zero and not carry" intrinsic: materialized via ptest + seta.
define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_3:
; X32: ## BB#0:
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: retl
;
; X64-LABEL: ptestz_3:
; X64: ## BB#0:
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: seta %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}


declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone

; This used to compile to insertps $0 + insertps $16. insertps $0 is always
; pointless.
; Complex-add style build: the two lane-wise sums should combine into a single
; insertps rather than insertps $0 + insertps $16 (see comment above).
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
; X32-LABEL: buildvector:
; X32: ## BB#0: ## %entry
; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X32-NEXT: addss %xmm1, %xmm0
; X32-NEXT: addss %xmm2, %xmm3
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X32-NEXT: retl
;
; X64-LABEL: buildvector:
; X64: ## BB#0: ## %entry
; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X64-NEXT: addss %xmm1, %xmm0
; X64-NEXT: addss %xmm2, %xmm3
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X64-NEXT: retq
entry:
  %tmp7 = extractelement <2 x float> %A, i32 0
  %tmp5 = extractelement <2 x float> %A, i32 1
  %tmp3 = extractelement <2 x float> %B, i32 0
  %tmp1 = extractelement <2 x float> %B, i32 1
  %add.r = fadd float %tmp7, %tmp3
  %add.i = fadd float %tmp5, %tmp1
  %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
  %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
  ret <2 x float> %tmp9
}

; Shuffle that inserts lane 0 of a loaded vector into lane 3: folds to a
; load-operand insertps.
define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_shufflevector_1:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_shufflevector_1:
; X64: ## BB#0: ## %entry
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
entry:
  %0 = load <4 x float>, <4 x float>* %pb, align 16
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit6
}

define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
; X32-LABEL: insertps_from_shufflevector_2:
; X32: ## BB#0: ## %entry
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_shufflevector_2:
; X64: ## BB#0: ## %entry
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-NEXT: retq
entry:
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %vecinit6
}

; For loading an i32 from memory into an xmm register we use pinsrd
; instead of insertps
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
; X32-LABEL: pinsrd_from_shufflevector_i32:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: pinsrd_from_shufflevector_i32:
; X64: ## BB#0: ## %entry
; X64-NEXT: pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT: retq
entry:
  %0 = load <4 x i32>, <4 x i32>* %pb, align 16
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit6
}

define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
; X32-LABEL: insertps_from_shufflevector_i32_2:
; X32: ## BB#0: ## %entry
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_shufflevector_i32_2:
; X64: ## BB#0: ## %entry
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT: retq
entry:
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
  ret <4 x i32> %vecinit6
}

define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-NEXT: retq
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ret <4 x float> %result
}

; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
; X64: ## BB#0:
; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X64-NEXT: retq
  %1 = load i32, i32* %b, align 4
  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  ret <4 x i32> %result
}

;;;;;; Shuffles optimizable with a single insertps or blend instruction
define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYZ0:
; X32: ## BB#0:
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X32-NEXT: retl
;
; X64-LABEL: shuf_XYZ0:
; X64: ## BB#0:
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XY00:
; X32: ## BB#0:
; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT: retl
;
; X64-LABEL: shuf_XY00:
; X64: ## BB#0:
; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYY0:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X32-NEXT: retl
;
; X64-LABEL: shuf_XYY0:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYW0:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X32-NEXT: retl
;
; X64-LABEL: shuf_XYW0:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext2 = extractelement <4 x float> %x, i32 3
  %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_W00W:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: shuf_W00W:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 3
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00A:
; X32: ## BB#0:
; X32-NEXT: xorps %xmm2, %xmm2
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: shuf_X00A:
; X64: ## BB#0:
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00X:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X32-NEXT: retl
;
; X64-LABEL: shuf_X00X:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X0YC:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X32-NEXT: retl
;
; X64-LABEL: shuf_X0YC:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x float> %vecinit5
}

; Integer versions of the shuf_* cases above: these select pxor/pshufd/pblendw
; (and pmovzxdq) instead of the float insertps/blendps forms.
define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYZ0:
; X32: ## BB#0:
; X32-NEXT: pxor %xmm1, %xmm1
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_XYZ0:
; X64: ## BB#0:
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext3 = extractelement <4 x i32> %x, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XY00:
; X32: ## BB#0:
; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_XY00:
; X64: ## BB#0:
; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYY0:
; X32: ## BB#0:
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X32-NEXT: pxor %xmm0, %xmm0
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_XYY0:
; X64: ## BB#0:
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYW0:
; X32: ## BB#0:
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X32-NEXT: pxor %xmm0, %xmm0
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_XYW0:
; X64: ## BB#0:
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext2 = extractelement <4 x i32> %x, i32 3
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_W00W:
; X32: ## BB#0:
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X32-NEXT: pxor %xmm0, %xmm0
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_W00W:
; X64: ## BB#0:
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 3
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00A:
; X32: ## BB#0:
; X32-NEXT: pxor %xmm2, %xmm2
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_X00A:
; X64: ## BB#0:
; X64-NEXT: pxor %xmm2, %xmm2
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00X:
; X32: ## BB#0:
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
; X32-NEXT: pxor %xmm0, %xmm0
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_X00X:
; X64: ## BB#0:
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X0YC:
; X32: ## BB#0:
; X32-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_X0YC:
; X64: ## BB#0:
; X64-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x i32> %vecinit5
}

;; Test for a bug in the first implementation of LowerBuildVectorv4x32
define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
; X32-LABEL: test_insertps_no_undef:
; X32: ## BB#0:
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X32-NEXT: maxps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_insertps_no_undef:
; X64: ## BB#0:
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X64-NEXT: maxps %xmm1, %xmm0
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  %mask = fcmp olt <4 x float> %vecinit5, %x
  %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
  ret <4 x float> %res
}

; <8 x i1> select has no direct instruction: expects sign-extension of the
; mask (psllw/psraw) followed by pblendvb.
define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
; X32-LABEL: blendvb_fallback:
; X32: ## BB#0:
; X32-NEXT: psllw $15, %xmm0
; X32-NEXT: psraw $15, %xmm0
; X32-NEXT: pblendvb %xmm1, %xmm2
; X32-NEXT: movdqa %xmm2, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: blendvb_fallback:
; X64: ## BB#0:
; X64-NEXT: psllw $15, %xmm0
; X64-NEXT: psraw $15, %xmm0
; X64-NEXT: pblendvb %xmm1, %xmm2
; X64-NEXT: movdqa %xmm2, %xmm0
; X64-NEXT: retq
  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
  ret <8 x i16> %ret
}

; On X32, account for the argument's move to registers
define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load:
; X64: ## BB#0:
; X64-NEXT: insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
  ret <4 x float> %2
}

;; Use a non-zero CountS for insertps
;; Try to match a bit more of the instr, since we need the load's offset.
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load_offset:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load_offset:
; X64: ## BB#0:
; X64-NEXT: insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X64-NEXT: retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
  ret <4 x float> %2
}

;; Try to match a bit more of the instr, since we need the load's offset.
; Indexed load: the scaled-index addressing (shll/shlq $4 then base+index)
; should fold into the insertps memory operand.
define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
; X32-LABEL: insertps_from_vector_load_offset_2:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: shll $4, %ecx
; X32-NEXT: insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load_offset_2:
; X64: ## BB#0:
; X64-NEXT: shlq $4, %rsi
; X64-NEXT: insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X64-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
  %2 = load <4 x float>, <4 x float>* %1, align 16
  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
  ret <4 x float> %3
}

; Scalar load broadcast to all four lanes, then insertps of lane 0:
; expects movss + shufps splat feeding the insertps.
define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_loadf32:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_loadf32:
; X64: ## BB#0:
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT: retq
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}

; Same broadcast-then-insert pattern, but the splatted element comes from a
; (possibly unaligned, align 4) vector load: expects movups + shufps splat.
define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
; X32-LABEL: insertps_from_broadcast_loadv4f32:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups (%eax), %xmm1
; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_loadv4f32:
; X64: ## BB#0:
; X64-NEXT: movups (%rdi), %xmm1
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT: retq
  %1 = load <4 x float>, <4 x float>* %b, align 4
  %2 = extractelement <4 x float> %1, i32 0
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}

;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_multiple_use:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; X32-NEXT: addps %xmm1, %xmm0
; X32-NEXT: addps %xmm2, %xmm3
; X32-NEXT: addps %xmm3, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_multiple_use:
; X64: ## BB#0:
; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: addps %xmm2, %xmm3
; X64-NEXT: addps %xmm3, %xmm0
; X64-NEXT: retq
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
  %11 = fadd <4 x float> %7, %8
  %12 = fadd <4 x float> %9, %10
  %13 = fadd <4 x float> %11, %12
  ret <4 x float> %13
}

define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
; X32-LABEL: insertps_with_undefs:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X32-NEXT: movapd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: insertps_with_undefs:
; X64: ## BB#0:
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT: movapd %xmm1, %xmm0
; X64-NEXT: retq
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
  ret <4 x float> %result
}

; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
; the destination index to change the load, instead of the source index.
define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
; X32-LABEL: pr20087:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: pr20087:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
; X64-NEXT: retq
  %load = load <4 x float> , <4 x float> *%ptr
  %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
  ret <4 x float> %ret
}

; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
; X32-LABEL: insertps_pr20411:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT: movdqu %xmm1, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: insertps_pr20411:
; X64: ## BB#0:
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT: movdqu %xmm1, (%rdi)
; X64-NEXT: retq
  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
  %ptrcast = bitcast i32* %RET to <4 x i32>*
  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
  ret void
}

define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_4:
; X32: ## BB#0: ## %entry
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X32-NEXT: retl
;
; X64-LABEL: insertps_4:
; X64: ## BB#0: ## %entry
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X64-NEXT: retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_5:
; X32: ## BB#0: ## %entry
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X32-NEXT: retl
;
; X64-LABEL: insertps_5:
; X64: ## BB#0: ## %entry
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X64-NEXT: retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %B, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_6:
; X32: ## BB#0: ## %entry
; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X32-NEXT: retl
;
; X64-LABEL: insertps_6:
; X64: ## BB#0: ## %entry
; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X64-NEXT: retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
}

define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_7:
; X32: ## BB#0: ## %entry
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X32-NEXT: retl
;
; X64-LABEL: insertps_7:
; X64: ## BB#0: ## %entry
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X64-NEXT: retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_8:
; X32: ## BB#0: ## %entry
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X32-NEXT: retl
;
; X64-LABEL: insertps_8:
; X64: ## BB#0: ## %entry
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X64-NEXT: retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %B, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_9:
; X32: ## BB#0: ## %entry
; X32-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X32-NEXT: movaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: insertps_9:
; X64: ## BB#0: ## %entry
; X64-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
}

define <4 x float> @insertps_10(<4 x float> %A)
; X32-LABEL: insertps_10:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X32-NEXT: retl
;
; X64-LABEL: insertps_10:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X64-NEXT: retq
{
  %vecext = extractelement <4 x float> %A, i32 0
  %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
  %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
  ret <4 x float> %vecbuild2
}

define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
; X32-LABEL: build_vector_to_shuffle_1:
; X32: ## BB#0: ## %entry
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: build_vector_to_shuffle_1:
; X64: ## BB#0: ## %entry
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X64-NEXT: retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit3
}

define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
; X32-LABEL: build_vector_to_shuffle_2:
; X32: ## BB#0: ## %entry
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X32-NEXT: retl
;
; X64-LABEL: build_vector_to_shuffle_2:
; X64: ## BB#0: ## %entry
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X64-NEXT: retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  ret <4 x float> %vecinit1
}