; LLVM CodeGen/X86 test: SSE4.1 lowering patterns (pinsr*, pmovzx, extractps, insertps, blendps, ptest).
      1 ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
      2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64
      3 
      4 @g16 = external global i16
      5 
; Inserting a scalar GPR value at lane 1 of a vector must lower to a single
; pinsrd: read from the incoming argument stack slot on 32-bit, directly
; from %edi on 64-bit.
define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
; X32-LABEL: pinsrd_1:
; X32:       ## BB#0:
; X32-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pinsrd_1:
; X64:       ## BB#0:
; X64-NEXT:    pinsrd $1, %edi, %xmm0
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
  ret <4 x i32> %tmp1
}

; Same pattern for the byte element type: a single pinsrb, taking the low
; byte of the 32-bit source operand.
define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
; X32-LABEL: pinsrb_1:
; X32:       ## BB#0:
; X32-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pinsrb_1:
; X64:       ## BB#0:
; X64-NEXT:    pinsrb $1, %edi, %xmm0
; X64-NEXT:    retq
  %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
  ret <16 x i8> %tmp1
}
     33 
; An i16 loaded from a global and fed (via insertelement + bitcast) into the
; pmovzxbq intrinsic should have the load folded into the pmovzxbq memory
; operand.  The address of @g16 is materialized through the Darwin non-lazy
; pointer on 32-bit and a GOT-relative load on 64-bit.
define <2 x i64> @pmovzxbq_1() nounwind {
; X32-LABEL: pmovzxbq_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl L_g16$non_lazy_ptr, %eax
; X32-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: pmovzxbq_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movq _g16@{{.*}}(%rip), %rax
; X64-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
entry:
	%0 = load i16, i16* @g16, align 2		; <i16> [#uses=1]
	%1 = insertelement <8 x i16> undef, i16 %0, i32 0		; <<8 x i16>> [#uses=1]
	%2 = bitcast <8 x i16> %1 to <16 x i8>		; <<16 x i8>> [#uses=1]
	%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone		; <<2 x i64>> [#uses=1]
	ret <2 x i64> %3
}

declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
     55 
; extractelement of lane 3 bitcast to i32 lowers to a single
; "extractps $3" straight into the return register on both targets.
define i32 @extractps_1(<4 x float> %v) nounwind {
; X32-LABEL: extractps_1:
; X32:       ## BB#0:
; X32-NEXT:    extractps $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: extractps_1:
; X64:       ## BB#0:
; X64-NEXT:    extractps $3, %xmm0, %eax
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  %i = bitcast float %s to i32
  ret i32 %i
}
; Equivalent form: bitcast the whole vector to <4 x i32> first, then
; extract lane 3 — same single extractps expected.
define i32 @extractps_2(<4 x float> %v) nounwind {
; X32-LABEL: extractps_2:
; X32:       ## BB#0:
; X32-NEXT:    extractps $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: extractps_2:
; X64:       ## BB#0:
; X64-NEXT:    extractps $3, %xmm0, %eax
; X64-NEXT:    retq
  %t = bitcast <4 x float> %v to <4 x i32>
  %s = extractelement <4 x i32> %t, i32 3
  ret i32 %s
}
     84 
     85 
; The non-store form of extractps puts its result into a GPR.
; This makes it suitable for an extract from a <4 x float> that
; is bitcasted to i32, but unsuitable for much of anything else.

; Extract lane 3 as a *float* that is then used arithmetically: expect a
; shufps to move the lane down, not extractps (which would bounce the value
; through a GPR).  On 32-bit the float return additionally goes through a
; stack slot and the x87 stack (flds).
define float @ext_1(<4 x float> %v) nounwind {
; X32-LABEL: ext_1:
; X32:       ## BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT:    addss LCPI5_0, %xmm0
; X32-NEXT:    movss %xmm0, (%esp)
; X32-NEXT:    flds (%esp)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_1:
; X64:       ## BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT:    addss {{.*}}(%rip), %xmm0
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  %t = fadd float %s, 1.0
  ret float %t
}
; Same lane-3 float extract but returned directly — still shufps, no add.
define float @ext_2(<4 x float> %v) nounwind {
; X32-LABEL: ext_2:
; X32:       ## BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT:    movss %xmm0, (%esp)
; X32-NEXT:    flds (%esp)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_2:
; X64:       ## BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  ret float %s
}
; Integer element extract to a GPR is the case where a pextrd is right.
define i32 @ext_3(<4 x i32> %v) nounwind {
; X32-LABEL: ext_3:
; X32:       ## BB#0:
; X32-NEXT:    pextrd $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_3:
; X64:       ## BB#0:
; X64-NEXT:    pextrd $3, %xmm0, %eax
; X64-NEXT:    retq
  %i = extractelement <4 x i32> %v, i32 3
  ret i32 %i
}
    140 
; Direct use of the insertps intrinsic with immediate 1 (zmask bit 0 set):
; expect a single insertps that zeroes lane 0 and keeps lanes 1-3 of %t1.
define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: insertps_1:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_1:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
  ret <4 x float> %tmp1
}

declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
    156 
; When optimizing for speed, prefer blendps over insertps even if it means we have to
; generate a separate movss to load the scalar operand.
define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
; X32-LABEL: blendps_not_insertps_1:
; X32:       ## BB#0:
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: blendps_not_insertps_1:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; When optimizing for size, generate an insertps if there's a load fold opportunity.
; The difference between i386 and x86-64 ABIs for the float operand means we should
; generate an insertps for X32 but not for X64!
; (On 32-bit the float arrives on the stack, so insertps can fold the load;
; on 64-bit it is already in xmm1, so blendps wins.)
define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
; X32-LABEL: insertps_or_blendps:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_or_blendps:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; An insert into the low 32-bits of a vector from the low 32-bits of another vector
; is always just a blendps because blendps is never more expensive than insertps.
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: blendps_not_insertps_2:
; X32:       ## BB#0:
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: blendps_not_insertps_2:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp2 = extractelement <4 x float> %t2, i32 0
  %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
  ret <4 x float> %tmp1
}
    207 
; ptestz intrinsic: a single ptest followed by sete (ZF) materialized with
; movzbl — no branch.
define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_1:
; X32:       ## BB#0:
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    sete %al
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_1:
; X64:       ## BB#0:
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    sete %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; ptestc intrinsic: CF is turned into 0/1 via sbb + and rather than setb.
define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_2:
; X32:       ## BB#0:
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    sbbl %eax, %eax
; X32-NEXT:    andl $1, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_2:
; X64:       ## BB#0:
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    sbbl %eax, %eax
; X64-NEXT:    andl $1, %eax
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; ptestnzc intrinsic: "not zero and not carry" maps to seta (CF=0 and ZF=0).
define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_3:
; X32:       ## BB#0:
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    seta %al
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_3:
; X64:       ## BB#0:
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    seta %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}


declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
    266 
; This used to compile to insertps $0  + insertps $16.  insertps $0 is always
; pointless.
; Complex-add style pattern: the two lane-wise sums are built with
; movshdup + addss and merged with one insertps into lane 1.
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind  {
; X32-LABEL: buildvector:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X32-NEXT:    addss %xmm1, %xmm0
; X32-NEXT:    addss %xmm2, %xmm3
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: buildvector:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X64-NEXT:    addss %xmm1, %xmm0
; X64-NEXT:    addss %xmm2, %xmm3
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X64-NEXT:    retq
entry:
  %tmp7 = extractelement <2 x float> %A, i32 0
  %tmp5 = extractelement <2 x float> %A, i32 1
  %tmp3 = extractelement <2 x float> %B, i32 0
  %tmp1 = extractelement <2 x float> %B, i32 1
  %add.r = fadd float %tmp7, %tmp3
  %add.i = fadd float %tmp5, %tmp1
  %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
  %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
  ret <2 x float> %tmp9
}
    298 
; Shuffle taking lane 0 of a *loaded* vector into lane 3: the load should
; fold into a single insertps memory operand.
define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_shufflevector_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT:    retq
entry:
  %0 = load <4 x float>, <4 x float>* %pb, align 16
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit6
}

; Register-to-register variant: lane 1 of %b into lane 2 of %a — one insertps.
define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
; X32-LABEL: insertps_from_shufflevector_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-NEXT:    retq
entry:
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %vecinit6
}
    330 
; For loading an i32 from memory into an xmm register we use pinsrd
; instead of insertps
; (Current lowering shown below is pshufd + pblendw in the integer domain,
; with the load folded into the pshufd.)
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
; X32-LABEL: pinsrd_from_shufflevector_i32:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: pinsrd_from_shufflevector_i32:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
entry:
  %0 = load <4 x i32>, <4 x i32>* %pb, align 16
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit6
}

; Integer lane insert from another register: lane 3 of %b into lane 1 of %a,
; lowered in the integer domain as pshufd + pblendw rather than insertps.
define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
; X32-LABEL: insertps_from_shufflevector_i32_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_i32_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT:    retq
entry:
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
  ret <4 x i32> %vecinit6
}
    368 
; A scalar float load inserted into lane 0 of an undef vector, then shuffled
; into lane 1 of %a: should collapse to one load-folding insertps.
define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-NEXT:    retq
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ret <4 x float> %result
}

; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
; Integer version of the pattern above: currently movd + pshufd + pblendw
; instead of a single folded insert.
define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
; X64:       ## BB#0:
; X64-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X64-NEXT:    retq
  %1 = load i32, i32* %b, align 4
  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  ret <4 x i32> %result
}
    407 
;;;;;; Shuffles optimizable with a single insertps or blend instruction
; Naming convention below: letters name which source lane lands in each
; result lane (X,Y,Z,W = lanes 0-3 of %x; A..D = lanes of %a), 0 = zero.

; XYZ0: keep lanes 0-2 of %x and zero lane 3 — blend against a zero register.
define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYZ0:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYZ0:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

; XY00: keep the low two lanes, zero the upper half — a single movq.
define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XY00:
; X32:       ## BB#0:
; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XY00:
; X64:       ## BB#0:
; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

; XYY0: duplicate lane 1 into lane 2 and zero lane 3 — one insertps
; using its zero-mask.
define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYY0:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYY0:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

; XYW0: lane 3 moved into lane 2, lane 3 zeroed — one insertps.
define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYW0:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYW0:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext2 = extractelement <4 x float> %x, i32 3
  %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

; W00W: lane 3 replicated to lanes 0 and 3, middle zeroed — one insertps.
define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_W00W:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_W00W:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 3
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
  ret <4 x float> %vecinit4
}
    506 
; X00A: lane 0 of %x, zeros, then lane 0 of %a — currently two instructions
; (zeroing blend, then insertps from %a).
define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00A:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm2, %xmm2
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X00A:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm2, %xmm2
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

; X00X: both non-zero lanes come from %x, so a single insertps with
; zero-mask suffices.
define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00X:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X00X:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

; X0YC: mixes zeros, two lanes of %x and lane 2 of %a — needs two insertps.
define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X0YC:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X0YC:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x float> %vecinit5
}
    566 
; Integer-domain counterparts of the shuf_* float tests above: the same lane
; patterns on <4 x i32> should lower with pxor/pshufd/pblendw instead of
; xorps/shufps/blendps.

; XYZ0 (i32): zero lane 3 via pxor + pblendw.
define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYZ0:
; X32:       ## BB#0:
; X32-NEXT:    pxor %xmm1, %xmm1
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYZ0:
; X64:       ## BB#0:
; X64-NEXT:    pxor %xmm1, %xmm1
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext3 = extractelement <4 x i32> %x, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

; XY00 (i32): zero the upper half — a single movq, same as the float case.
define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XY00:
; X32:       ## BB#0:
; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XY00:
; X64:       ## BB#0:
; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

; XYY0 (i32): pshufd to replicate lane 1, then zero lane 3 with pblendw.
define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYY0:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYY0:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

; XYW0 (i32): pshufd moves lane 3 into lane 2, then pblendw zeroes lane 3.
define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYW0:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYW0:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext2 = extractelement <4 x i32> %x, i32 3
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}
    654 
    655 define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
    656 ; X32-LABEL: i32_shuf_W00W:
    657 ; X32:       ## BB#0:
    658 ; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
    659 ; X32-NEXT:    pxor %xmm0, %xmm0
    660 ; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
    661 ; X32-NEXT:    retl
    662 ;
    663 ; X64-LABEL: i32_shuf_W00W:
    664 ; X64:       ## BB#0:
    665 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
    666 ; X64-NEXT:    pxor %xmm0, %xmm0
    667 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
    668 ; X64-NEXT:    retq
    669   %vecext = extractelement <4 x i32> %x, i32 3
    670   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    671   %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
    672   %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
    673   %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
    674   ret <4 x i32> %vecinit4
    675 }
    676 
    677 define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
    678 ; X32-LABEL: i32_shuf_X00A:
    679 ; X32:       ## BB#0:
    680 ; X32-NEXT:    pxor %xmm2, %xmm2
    681 ; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
    682 ; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
    683 ; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
    684 ; X32-NEXT:    retl
    685 ;
    686 ; X64-LABEL: i32_shuf_X00A:
    687 ; X64:       ## BB#0:
    688 ; X64-NEXT:    pxor %xmm2, %xmm2
    689 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
    690 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
    691 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
    692 ; X64-NEXT:    retq
    693   %vecext = extractelement <4 x i32> %x, i32 0
    694   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    695   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
    696   %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
    697   %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    698   ret <4 x i32> %vecinit4
    699 }
    700 
    701 define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
; Builds <x[0], 0, 0, x[0]> — shuffle index 4 selects element 0 of the second
; operand, which is %x again; %a is unused. Expected lowering duplicates x[0]
; with pshufd, zeroes a register, and blends.
; X32-LABEL: i32_shuf_X00X:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X00X:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
    721 }
    722 
    723 define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
; Builds <x[0], 0, x[1], a[2]>. The <x[0],0,x[1],...> zero-interleave of the
; low two lanes is recognized as a pmovzxdq of %x; a[2] is blended into the
; top lane afterwards.
; X32-LABEL: i32_shuf_X0YC:
; X32:       ## BB#0:
; X32-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X0YC:
; X64:       ## BB#0:
; X64-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x i32> %vecinit5
    743 }
    744 
    745 ;; Test for a bug in the first implementation of LowerBuildVectorv4x32
    746 define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
; Builds %vecinit5 = <x[0], x[1], x[2], 0> with no undef lanes, then selects
; the per-lane larger of %x and %vecinit5 (fcmp olt + select -> maxps).
; Expected lowering keeps the build_vector as a blend with a zero register
; rather than collapsing it incorrectly (the bug this test guards against).
; X32-LABEL: test_insertps_no_undef:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X32-NEXT:    maxps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_insertps_no_undef:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X64-NEXT:    maxps %xmm1, %xmm0
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  %mask = fcmp olt <4 x float> %vecinit5, %x
  %res = select  <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
  ret <4 x float> %res
    770 }
    771 
    772 define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
; vselect on an <8 x i1> mask with no direct i16 blend: the i1 mask arrives in
; the low bit of each i16 lane, so it is sign-extended in place (psllw/psraw
; by 15) to make every mask bit uniform, then pblendvb performs the select.
; X32-LABEL: blendvb_fallback:
; X32:       ## BB#0:
; X32-NEXT:    psllw $15, %xmm0
; X32-NEXT:    psraw $15, %xmm0
; X32-NEXT:    pblendvb %xmm1, %xmm2
; X32-NEXT:    movdqa %xmm2, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: blendvb_fallback:
; X64:       ## BB#0:
; X64-NEXT:    psllw $15, %xmm0
; X64-NEXT:    psraw $15, %xmm0
; X64-NEXT:    pblendvb %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm0
; X64-NEXT:    retq
  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
  ret <8 x i16> %ret
    790 }
    791 
    792 ; On X32, account for the argument's move to registers
    793 define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; insertps imm = 48 (0x30): source element 0 into destination lane 3, no zero
; mask. The full-vector load of %pb must fold into insertps' memory operand
; instead of being emitted as a separate load.
; X32-LABEL: insertps_from_vector_load:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps    $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load:
; X64:       ## BB#0:
; X64-NEXT:    insertps    $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
  ret <4 x float> %2
    807 }
    808 
    809 ;; Use a non-zero CountS for insertps
    810 ;; Try to match a bit more of the instr, since we need the load's offset.
    811 define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; insertps imm = 96 (0x60): source element 1 into destination lane 2. When the
; load is folded, the non-zero source index becomes a +4-byte address offset
; and the emitted immediate drops to 32 (source element 0 of the folded scalar).
; X32-LABEL: insertps_from_vector_load_offset:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps    $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load_offset:
; X64:       ## BB#0:
; X64-NEXT:    insertps    $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
  ret <4 x float> %2
    825 }
    826 
    827 ;; Try to match a bit more of the instr, since we need the load's offset.
    828 define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
; Indexed variant: imm = 192 (0xC0) selects source element 3 into lane 0, so
; the folded load should use a +12-byte displacement with a base+index
; addressing mode (the shift by 4 scales %index by sizeof(<4 x float>)).
; X32-LABEL: insertps_from_vector_load_offset_2:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    shll $4, %ecx
; X32-NEXT:    insertps    $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load_offset_2:
; X64:       ## BB#0:
; X64-NEXT:    shlq $4, %rsi
; X64-NEXT:    insertps    $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
  %2 = load <4 x float>, <4 x float>* %1, align 16
  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
  ret <4 x float> %3
    846 }
    847 
    848 define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
; A scalar float is loaded, splatted across all four lanes (movss + shufps),
; and element 0 of the splat is inserted into lane 3 of %a (imm 48).
; X32-LABEL: insertps_from_broadcast_loadf32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_loadf32:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT:    retq
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
    872 }
    873 
    874 define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
; Same as the scalar-load broadcast above, but the splat source is element 0
; of a whole-vector load. align 4 on a <4 x float> load forces the unaligned
; form (movups) rather than movaps.
; X32-LABEL: insertps_from_broadcast_loadv4f32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movups (%eax), %xmm1
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_loadv4f32:
; X64:       ## BB#0:
; X64-NEXT:    movups (%rdi), %xmm1
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %b, align 4
  %2 = extractelement <4 x float> %1, i32 0
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
    897 }
    898 
    899 ;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
    900 define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
; One loaded-and-splatted scalar feeds four separate insertps calls (lane 3 of
; %a, %b, %c, %d); the four results are then summed. The splat should be
; materialized once and reused (see the FIXME above about the extra shuffle).
; X32-LABEL: insertps_from_broadcast_multiple_use:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X32-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; X32-NEXT:    addps %xmm1, %xmm0
; X32-NEXT:    addps %xmm2, %xmm3
; X32-NEXT:    addps %xmm3, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_multiple_use:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X64-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; X64-NEXT:    addps %xmm1, %xmm0
; X64-NEXT:    addps %xmm2, %xmm3
; X64-NEXT:    addps %xmm3, %xmm0
; X64-NEXT:    retq
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
  %11 = fadd <4 x float> %7, %8
  %12 = fadd <4 x float> %9, %10
  %13 = fadd <4 x float> %11, %12
  ret <4 x float> %13
    942 }
    943 
    944 define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
; Shuffle mask <4, undef, 0, 7> places the loaded scalar in lane 0 and a[0] in
; lane 2; with two undef-reachable lanes the expected lowering is
; movss + unpcklpd rather than insertps.
; X32-LABEL: insertps_with_undefs:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X32-NEXT:    movapd %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: insertps_with_undefs:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT:    movapd %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
  ret <4 x float> %result
    963 }
    964 
    965 ; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
    966 ; the destination index to change the load, instead of the source index.
    967 define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
; PR20087 regression: the element folded from the loaded vector must be chosen
; by the SOURCE index of the shuffle, not the destination index. Expected
; result is <a[0], zero, a[2], load-elem> with the load folded into insertps.
; X32-LABEL: pr20087:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: pr20087:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
; X64-NEXT:    retq
  %load = load <4 x float> , <4 x float> *%ptr
  %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
  ret <4 x float> %ret
    981 }
    982 
    983 ; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
    984 define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
; PR20411 edge case: mask <0, 7, undef, undef> combines lane 0 of the first
; operand with lane 3 of the second; expected lowering is pshufd + pblendw and
; an unaligned store (align 4) through the casted i32* pointer.
; X32-LABEL: insertps_pr20411:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT:    movdqu %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: insertps_pr20411:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT:    movdqu %xmm1, (%rdi)
; X64-NEXT:    retq
  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
  %ptrcast = bitcast i32* %RET to <4 x i32>*
  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
  ret void
   1003 }
   1004 
   1005 define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
; Builds <A[0], 0, B[2], 0>; should match a single insertps with both the
; element insert and the zero mask encoded in the immediate.
; X32-LABEL: insertps_4:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_4:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
   1023 }
   1024 
   1025 define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
; Builds <A[0], B[1], 0, 0>; single insertps with the two high lanes zeroed
; via the immediate's zero mask.
; X32-LABEL: insertps_5:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_5:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %B, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
   1043 }
   1044 
   1045 define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
; Builds <0, A[1], B[2], 0> (note the zero in lane 0 comes from the constant
; base vector, not an insert); single insertps with lanes 0 and 3 zero-masked.
; X32-LABEL: insertps_6:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_6:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
   1062 }
   1063 
   1064 define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
; Builds <A[0], 0, B[1], 0>; single insertps inserting B's element 1 into
; lane 2 with lanes 1 and 3 zero-masked.
; X32-LABEL: insertps_7:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_7:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
   1082 }
   1083 
   1084 define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
; Builds <A[0], B[0], 0, 0>; single insertps inserting B's element 0 into
; lane 1 with the two high lanes zero-masked.
; X32-LABEL: insertps_8:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_8:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %B, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
   1102 }
   1103 
   1104 define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
; Builds <0, A[0], B[2], 0>. Here insertps targets %B's register (A[0] is
; inserted into it with lanes 0 and 3 zeroed), so an extra movaps copies the
; result back to the return register.
; X32-LABEL: insertps_9:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: insertps_9:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
   1123 }
   1124 
   1125 define <4 x float> @insertps_10(<4 x float> %A)
; Builds <A[0], 0, A[0], 0> from a zeroinitializer base; both occurrences of
; A[0] plus the zero lanes should collapse into one insertps on %A itself.
; X32-LABEL: insertps_10:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_10:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X64-NEXT:    retq
{
  %vecext = extractelement <4 x float> %A, i32 0
  %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
  %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
  ret <4 x float> %vecbuild2
   1140 }
   1141 
   1142 define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
; Builds <0, A[1], 0, A[3]> (the final shuffle's index 7 pulls A[3] into
; lane 3); a build_vector of zeros and %A elements should lower to a blendps
; against a zeroed register rather than a chain of inserts.
; X32-LABEL: build_vector_to_shuffle_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: build_vector_to_shuffle_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit3
   1160 }
   1161 
   1162 define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
; Builds <0, A[1], 0, 0> — same as above but without the trailing shuffle, so
; lane 3 keeps the zero from the zeroinitializer base; still expected to lower
; to a single blendps against a zeroed register.
; X32-LABEL: build_vector_to_shuffle_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: build_vector_to_shuffle_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  ret <4 x float> %vecinit1
   1179 }
   1180