; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64

      4 @g16 = external global i16
      5 
      6 define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
      7 ; X32-LABEL: pinsrd_1:
      8 ; X32:       ## BB#0:
      9 ; X32-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0
     10 ; X32-NEXT:    retl
     11 ;
     12 ; X64-LABEL: pinsrd_1:
     13 ; X64:       ## BB#0:
     14 ; X64-NEXT:    pinsrd $1, %edi, %xmm0
     15 ; X64-NEXT:    retq
     16   %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
     17   ret <4 x i32> %tmp1
     18 }
     19 
     20 define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
     21 ; X32-LABEL: pinsrb_1:
     22 ; X32:       ## BB#0:
     23 ; X32-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
     24 ; X32-NEXT:    retl
     25 ;
     26 ; X64-LABEL: pinsrb_1:
     27 ; X64:       ## BB#0:
     28 ; X64-NEXT:    pinsrb $1, %edi, %xmm0
     29 ; X64-NEXT:    retq
     30   %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
     31   ret <16 x i8> %tmp1
     32 }
     33 
     34 define <2 x i64> @pmovzxbq_1() nounwind {
     35 ; X32-LABEL: pmovzxbq_1:
     36 ; X32:       ## BB#0: ## %entry
     37 ; X32-NEXT:    movl L_g16$non_lazy_ptr, %eax
     38 ; X32-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
     39 ; X32-NEXT:    retl
     40 ;
     41 ; X64-LABEL: pmovzxbq_1:
     42 ; X64:       ## BB#0: ## %entry
     43 ; X64-NEXT:    movq _g16@{{.*}}(%rip), %rax
     44 ; X64-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
     45 ; X64-NEXT:    retq
     46 entry:
     47 	%0 = load i16, i16* @g16, align 2		; <i16> [#uses=1]
     48 	%1 = insertelement <8 x i16> undef, i16 %0, i32 0		; <<8 x i16>> [#uses=1]
     49 	%2 = bitcast <8 x i16> %1 to <16 x i8>		; <<16 x i8>> [#uses=1]
     50 	%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone		; <<2 x i64>> [#uses=1]
     51 	ret <2 x i64> %3
     52 }
     53 
     54 declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
     55 
     56 define i32 @extractps_1(<4 x float> %v) nounwind {
     57 ; X32-LABEL: extractps_1:
     58 ; X32:       ## BB#0:
     59 ; X32-NEXT:    extractps $3, %xmm0, %eax
     60 ; X32-NEXT:    retl
     61 ;
     62 ; X64-LABEL: extractps_1:
     63 ; X64:       ## BB#0:
     64 ; X64-NEXT:    extractps $3, %xmm0, %eax
     65 ; X64-NEXT:    retq
     66   %s = extractelement <4 x float> %v, i32 3
     67   %i = bitcast float %s to i32
     68   ret i32 %i
     69 }
     70 define i32 @extractps_2(<4 x float> %v) nounwind {
     71 ; X32-LABEL: extractps_2:
     72 ; X32:       ## BB#0:
     73 ; X32-NEXT:    extractps $3, %xmm0, %eax
     74 ; X32-NEXT:    retl
     75 ;
     76 ; X64-LABEL: extractps_2:
     77 ; X64:       ## BB#0:
     78 ; X64-NEXT:    extractps $3, %xmm0, %eax
     79 ; X64-NEXT:    retq
     80   %t = bitcast <4 x float> %v to <4 x i32>
     81   %s = extractelement <4 x i32> %t, i32 3
     82   ret i32 %s
     83 }
     84 
     85 
     86 ; The non-store form of extractps puts its result into a GPR.
     87 ; This makes it suitable for an extract from a <4 x float> that
     88 ; is bitcasted to i32, but unsuitable for much of anything else.
     89 
     90 define float @ext_1(<4 x float> %v) nounwind {
     91 ; X32-LABEL: ext_1:
     92 ; X32:       ## BB#0:
     93 ; X32-NEXT:    pushl %eax
     94 ; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
     95 ; X32-NEXT:    addss LCPI5_0, %xmm0
     96 ; X32-NEXT:    movss %xmm0, (%esp)
     97 ; X32-NEXT:    flds (%esp)
     98 ; X32-NEXT:    popl %eax
     99 ; X32-NEXT:    retl
    100 ;
    101 ; X64-LABEL: ext_1:
    102 ; X64:       ## BB#0:
    103 ; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    104 ; X64-NEXT:    addss {{.*}}(%rip), %xmm0
    105 ; X64-NEXT:    retq
    106   %s = extractelement <4 x float> %v, i32 3
    107   %t = fadd float %s, 1.0
    108   ret float %t
    109 }
    110 define float @ext_2(<4 x float> %v) nounwind {
    111 ; X32-LABEL: ext_2:
    112 ; X32:       ## BB#0:
    113 ; X32-NEXT:    pushl %eax
    114 ; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    115 ; X32-NEXT:    movss %xmm0, (%esp)
    116 ; X32-NEXT:    flds (%esp)
    117 ; X32-NEXT:    popl %eax
    118 ; X32-NEXT:    retl
    119 ;
    120 ; X64-LABEL: ext_2:
    121 ; X64:       ## BB#0:
    122 ; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    123 ; X64-NEXT:    retq
    124   %s = extractelement <4 x float> %v, i32 3
    125   ret float %s
    126 }
    127 define i32 @ext_3(<4 x i32> %v) nounwind {
    128 ; X32-LABEL: ext_3:
    129 ; X32:       ## BB#0:
    130 ; X32-NEXT:    pextrd $3, %xmm0, %eax
    131 ; X32-NEXT:    retl
    132 ;
    133 ; X64-LABEL: ext_3:
    134 ; X64:       ## BB#0:
    135 ; X64-NEXT:    pextrd $3, %xmm0, %eax
    136 ; X64-NEXT:    retq
    137   %i = extractelement <4 x i32> %v, i32 3
    138   ret i32 %i
    139 }
    140 
    141 define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
    142 ; X32-LABEL: insertps_1:
    143 ; X32:       ## BB#0:
    144 ; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
    145 ; X32-NEXT:    retl
    146 ;
    147 ; X64-LABEL: insertps_1:
    148 ; X64:       ## BB#0:
    149 ; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
    150 ; X64-NEXT:    retq
    151   %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 21) nounwind readnone
    152   ret <4 x float> %tmp1
    153 }
    154 
    155 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
    156 
    157 ; When optimizing for speed, prefer blendps over insertps even if it means we have to
    158 ; generate a separate movss to load the scalar operand.
    159 define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
    160 ; X32-LABEL: blendps_not_insertps_1:
    161 ; X32:       ## BB#0:
    162 ; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
    163 ; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
    164 ; X32-NEXT:    retl
    165 ;
    166 ; X64-LABEL: blendps_not_insertps_1:
    167 ; X64:       ## BB#0:
    168 ; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
    169 ; X64-NEXT:    retq
    170   %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
    171   ret <4 x float> %tmp1
    172 }
    173 
    174 ; When optimizing for size, generate an insertps if there's a load fold opportunity.
    175 ; The difference between i386 and x86-64 ABIs for the float operand means we should
    176 ; generate an insertps for X32 but not for X64!
    177 define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
    178 ; X32-LABEL: insertps_or_blendps:
    179 ; X32:       ## BB#0:
    180 ; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
    181 ; X32-NEXT:    retl
    182 ;
    183 ; X64-LABEL: insertps_or_blendps:
    184 ; X64:       ## BB#0:
    185 ; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
    186 ; X64-NEXT:    retq
    187   %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
    188   ret <4 x float> %tmp1
    189 }
    190 
    191 ; An insert into the low 32-bits of a vector from the low 32-bits of another vector
    192 ; is always just a blendps because blendps is never more expensive than insertps.
    193 define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
    194 ; X32-LABEL: blendps_not_insertps_2:
    195 ; X32:       ## BB#0:
    196 ; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
    197 ; X32-NEXT:    retl
    198 ;
    199 ; X64-LABEL: blendps_not_insertps_2:
    200 ; X64:       ## BB#0:
    201 ; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
    202 ; X64-NEXT:    retq
    203   %tmp2 = extractelement <4 x float> %t2, i32 0
    204   %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
    205   ret <4 x float> %tmp1
    206 }
    207 
    208 define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
    209 ; X32-LABEL: ptestz_1:
    210 ; X32:       ## BB#0:
    211 ; X32-NEXT:    xorl %eax, %eax
    212 ; X32-NEXT:    ptest %xmm1, %xmm0
    213 ; X32-NEXT:    sete %al
    214 ; X32-NEXT:    retl
    215 ;
    216 ; X64-LABEL: ptestz_1:
    217 ; X64:       ## BB#0:
    218 ; X64-NEXT:    xorl %eax, %eax
    219 ; X64-NEXT:    ptest %xmm1, %xmm0
    220 ; X64-NEXT:    sete %al
    221 ; X64-NEXT:    retq
    222   %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
    223   ret i32 %tmp1
    224 }
    225 
    226 define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
    227 ; X32-LABEL: ptestz_2:
    228 ; X32:       ## BB#0:
    229 ; X32-NEXT:    ptest %xmm1, %xmm0
    230 ; X32-NEXT:    sbbl %eax, %eax
    231 ; X32-NEXT:    andl $1, %eax
    232 ; X32-NEXT:    retl
    233 ;
    234 ; X64-LABEL: ptestz_2:
    235 ; X64:       ## BB#0:
    236 ; X64-NEXT:    ptest %xmm1, %xmm0
    237 ; X64-NEXT:    sbbl %eax, %eax
    238 ; X64-NEXT:    andl $1, %eax
    239 ; X64-NEXT:    retq
    240   %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
    241   ret i32 %tmp1
    242 }
    243 
    244 define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
    245 ; X32-LABEL: ptestz_3:
    246 ; X32:       ## BB#0:
    247 ; X32-NEXT:    xorl %eax, %eax
    248 ; X32-NEXT:    ptest %xmm1, %xmm0
    249 ; X32-NEXT:    seta %al
    250 ; X32-NEXT:    retl
    251 ;
    252 ; X64-LABEL: ptestz_3:
    253 ; X64:       ## BB#0:
    254 ; X64-NEXT:    xorl %eax, %eax
    255 ; X64-NEXT:    ptest %xmm1, %xmm0
    256 ; X64-NEXT:    seta %al
    257 ; X64-NEXT:    retq
    258   %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
    259   ret i32 %tmp1
    260 }
    261 
    262 
    263 declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
    264 declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
    265 declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
    266 
    267 ; This used to compile to insertps $0  + insertps $16.  insertps $0 is always
    268 ; pointless.
    269 define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind  {
    270 ; X32-LABEL: buildvector:
    271 ; X32:       ## BB#0: ## %entry
    272 ; X32-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    273 ; X32-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
    274 ; X32-NEXT:    addss %xmm1, %xmm0
    275 ; X32-NEXT:    addss %xmm2, %xmm3
    276 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
    277 ; X32-NEXT:    retl
    278 ;
    279 ; X64-LABEL: buildvector:
    280 ; X64:       ## BB#0: ## %entry
    281 ; X64-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    282 ; X64-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
    283 ; X64-NEXT:    addss %xmm1, %xmm0
    284 ; X64-NEXT:    addss %xmm2, %xmm3
    285 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
    286 ; X64-NEXT:    retq
    287 entry:
    288   %tmp7 = extractelement <2 x float> %A, i32 0
    289   %tmp5 = extractelement <2 x float> %A, i32 1
    290   %tmp3 = extractelement <2 x float> %B, i32 0
    291   %tmp1 = extractelement <2 x float> %B, i32 1
    292   %add.r = fadd float %tmp7, %tmp3
    293   %add.i = fadd float %tmp5, %tmp1
    294   %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
    295   %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
    296   ret <2 x float> %tmp9
    297 }
    298 
    299 define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
    300 ; X32-LABEL: insertps_from_shufflevector_1:
    301 ; X32:       ## BB#0: ## %entry
    302 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    303 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
    304 ; X32-NEXT:    retl
    305 ;
    306 ; X64-LABEL: insertps_from_shufflevector_1:
    307 ; X64:       ## BB#0: ## %entry
    308 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
    309 ; X64-NEXT:    retq
    310 entry:
    311   %0 = load <4 x float>, <4 x float>* %pb, align 16
    312   %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    313   ret <4 x float> %vecinit6
    314 }
    315 
    316 define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
    317 ; X32-LABEL: insertps_from_shufflevector_2:
    318 ; X32:       ## BB#0: ## %entry
    319 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
    320 ; X32-NEXT:    retl
    321 ;
    322 ; X64-LABEL: insertps_from_shufflevector_2:
    323 ; X64:       ## BB#0: ## %entry
    324 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
    325 ; X64-NEXT:    retq
    326 entry:
    327   %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
    328   ret <4 x float> %vecinit6
    329 }
    330 
    331 ; For loading an i32 from memory into an xmm register we use pinsrd
    332 ; instead of insertps
    333 define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
    334 ; X32-LABEL: pinsrd_from_shufflevector_i32:
    335 ; X32:       ## BB#0: ## %entry
    336 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    337 ; X32-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
    338 ; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
    339 ; X32-NEXT:    retl
    340 ;
    341 ; X64-LABEL: pinsrd_from_shufflevector_i32:
    342 ; X64:       ## BB#0: ## %entry
    343 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
    344 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
    345 ; X64-NEXT:    retq
    346 entry:
    347   %0 = load <4 x i32>, <4 x i32>* %pb, align 16
    348   %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    349   ret <4 x i32> %vecinit6
    350 }
    351 
    352 define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
    353 ; X32-LABEL: insertps_from_shufflevector_i32_2:
    354 ; X32:       ## BB#0: ## %entry
    355 ; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
    356 ; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
    357 ; X32-NEXT:    retl
    358 ;
    359 ; X64-LABEL: insertps_from_shufflevector_i32_2:
    360 ; X64:       ## BB#0: ## %entry
    361 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
    362 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
    363 ; X64-NEXT:    retq
    364 entry:
    365   %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
    366   ret <4 x i32> %vecinit6
    367 }
    368 
    369 define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
    370 ; X32-LABEL: insertps_from_load_ins_elt_undef:
    371 ; X32:       ## BB#0:
    372 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    373 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
    374 ; X32-NEXT:    retl
    375 ;
    376 ; X64-LABEL: insertps_from_load_ins_elt_undef:
    377 ; X64:       ## BB#0:
    378 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
    379 ; X64-NEXT:    retq
    380   %1 = load float, float* %b, align 4
    381   %2 = insertelement <4 x float> undef, float %1, i32 0
    382   %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
    383   ret <4 x float> %result
    384 }
    385 
    386 ; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
    387 define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
    388 ; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
    389 ; X32:       ## BB#0:
    390 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    391 ; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
    392 ; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
    393 ; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
    394 ; X32-NEXT:    retl
    395 ;
    396 ; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
    397 ; X64:       ## BB#0:
    398 ; X64-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
    399 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
    400 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
    401 ; X64-NEXT:    retq
    402   %1 = load i32, i32* %b, align 4
    403   %2 = insertelement <4 x i32> undef, i32 %1, i32 0
    404   %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
    405   ret <4 x i32> %result
    406 }
    407 
    408 ;;;;;; Shuffles optimizable with a single insertps or blend instruction
    409 define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
    410 ; X32-LABEL: shuf_XYZ0:
    411 ; X32:       ## BB#0:
    412 ; X32-NEXT:    xorps %xmm1, %xmm1
    413 ; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
    414 ; X32-NEXT:    retl
    415 ;
    416 ; X64-LABEL: shuf_XYZ0:
    417 ; X64:       ## BB#0:
    418 ; X64-NEXT:    xorps %xmm1, %xmm1
    419 ; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
    420 ; X64-NEXT:    retq
    421   %vecext = extractelement <4 x float> %x, i32 0
    422   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    423   %vecext1 = extractelement <4 x float> %x, i32 1
    424   %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
    425   %vecext3 = extractelement <4 x float> %x, i32 2
    426   %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
    427   %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
    428   ret <4 x float> %vecinit5
    429 }
    430 
    431 define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
    432 ; X32-LABEL: shuf_XY00:
    433 ; X32:       ## BB#0:
    434 ; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
    435 ; X32-NEXT:    retl
    436 ;
    437 ; X64-LABEL: shuf_XY00:
    438 ; X64:       ## BB#0:
    439 ; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
    440 ; X64-NEXT:    retq
    441   %vecext = extractelement <4 x float> %x, i32 0
    442   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    443   %vecext1 = extractelement <4 x float> %x, i32 1
    444   %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
    445   %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
    446   %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
    447   ret <4 x float> %vecinit4
    448 }
    449 
    450 define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
    451 ; X32-LABEL: shuf_XYY0:
    452 ; X32:       ## BB#0:
    453 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
    454 ; X32-NEXT:    retl
    455 ;
    456 ; X64-LABEL: shuf_XYY0:
    457 ; X64:       ## BB#0:
    458 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
    459 ; X64-NEXT:    retq
    460   %vecext = extractelement <4 x float> %x, i32 0
    461   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    462   %vecext1 = extractelement <4 x float> %x, i32 1
    463   %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
    464   %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
    465   %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
    466   ret <4 x float> %vecinit5
    467 }
    468 
    469 define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
    470 ; X32-LABEL: shuf_XYW0:
    471 ; X32:       ## BB#0:
    472 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
    473 ; X32-NEXT:    retl
    474 ;
    475 ; X64-LABEL: shuf_XYW0:
    476 ; X64:       ## BB#0:
    477 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
    478 ; X64-NEXT:    retq
    479   %vecext = extractelement <4 x float> %x, i32 0
    480   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    481   %vecext1 = extractelement <4 x float> %x, i32 1
    482   %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
    483   %vecext2 = extractelement <4 x float> %x, i32 3
    484   %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
    485   %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
    486   ret <4 x float> %vecinit4
    487 }
    488 
    489 define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
    490 ; X32-LABEL: shuf_W00W:
    491 ; X32:       ## BB#0:
    492 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
    493 ; X32-NEXT:    retl
    494 ;
    495 ; X64-LABEL: shuf_W00W:
    496 ; X64:       ## BB#0:
    497 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
    498 ; X64-NEXT:    retq
    499   %vecext = extractelement <4 x float> %x, i32 3
    500   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    501   %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
    502   %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
    503   %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
    504   ret <4 x float> %vecinit4
    505 }
    506 
    507 define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
    508 ; X32-LABEL: shuf_X00A:
    509 ; X32:       ## BB#0:
    510 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
    511 ; X32-NEXT:    retl
    512 ;
    513 ; X64-LABEL: shuf_X00A:
    514 ; X64:       ## BB#0:
    515 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
    516 ; X64-NEXT:    retq
    517   %vecext = extractelement <4 x float> %x, i32 0
    518   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    519   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
    520   %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
    521   %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    522   ret <4 x float> %vecinit4
    523 }
    524 
    525 define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
    526 ; X32-LABEL: shuf_X00X:
    527 ; X32:       ## BB#0:
    528 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
    529 ; X32-NEXT:    retl
    530 ;
    531 ; X64-LABEL: shuf_X00X:
    532 ; X64:       ## BB#0:
    533 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
    534 ; X64-NEXT:    retq
    535   %vecext = extractelement <4 x float> %x, i32 0
    536   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    537   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
    538   %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
    539   %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    540   ret <4 x float> %vecinit4
    541 }
    542 
    543 define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
    544 ; X32-LABEL: shuf_X0YC:
    545 ; X32:       ## BB#0:
    546 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
    547 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
    548 ; X32-NEXT:    retl
    549 ;
    550 ; X64-LABEL: shuf_X0YC:
    551 ; X64:       ## BB#0:
    552 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
    553 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
    554 ; X64-NEXT:    retq
    555   %vecext = extractelement <4 x float> %x, i32 0
    556   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
    557   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
    558   %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
    559   %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
    560   ret <4 x float> %vecinit5
    561 }
    562 
    563 define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
    564 ; X32-LABEL: i32_shuf_XYZ0:
    565 ; X32:       ## BB#0:
    566 ; X32-NEXT:    pxor %xmm1, %xmm1
    567 ; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
    568 ; X32-NEXT:    retl
    569 ;
    570 ; X64-LABEL: i32_shuf_XYZ0:
    571 ; X64:       ## BB#0:
    572 ; X64-NEXT:    pxor %xmm1, %xmm1
    573 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
    574 ; X64-NEXT:    retq
    575   %vecext = extractelement <4 x i32> %x, i32 0
    576   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    577   %vecext1 = extractelement <4 x i32> %x, i32 1
    578   %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
    579   %vecext3 = extractelement <4 x i32> %x, i32 2
    580   %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
    581   %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
    582   ret <4 x i32> %vecinit5
    583 }
    584 
    585 define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
    586 ; X32-LABEL: i32_shuf_XY00:
    587 ; X32:       ## BB#0:
    588 ; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
    589 ; X32-NEXT:    retl
    590 ;
    591 ; X64-LABEL: i32_shuf_XY00:
    592 ; X64:       ## BB#0:
    593 ; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
    594 ; X64-NEXT:    retq
    595   %vecext = extractelement <4 x i32> %x, i32 0
    596   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    597   %vecext1 = extractelement <4 x i32> %x, i32 1
    598   %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
    599   %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
    600   %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
    601   ret <4 x i32> %vecinit4
    602 }
    603 
    604 define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
    605 ; X32-LABEL: i32_shuf_XYY0:
    606 ; X32:       ## BB#0:
    607 ; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
    608 ; X32-NEXT:    pxor %xmm0, %xmm0
    609 ; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
    610 ; X32-NEXT:    retl
    611 ;
    612 ; X64-LABEL: i32_shuf_XYY0:
    613 ; X64:       ## BB#0:
    614 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
    615 ; X64-NEXT:    pxor %xmm0, %xmm0
    616 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
    617 ; X64-NEXT:    retq
    618   %vecext = extractelement <4 x i32> %x, i32 0
    619   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    620   %vecext1 = extractelement <4 x i32> %x, i32 1
    621   %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
    622   %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
    623   %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
    624   ret <4 x i32> %vecinit5
    625 }
    626 
    627 define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
    628 ; X32-LABEL: i32_shuf_XYW0:
    629 ; X32:       ## BB#0:
    630 ; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
    631 ; X32-NEXT:    pxor %xmm0, %xmm0
    632 ; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
    633 ; X32-NEXT:    retl
    634 ;
    635 ; X64-LABEL: i32_shuf_XYW0:
    636 ; X64:       ## BB#0:
    637 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
    638 ; X64-NEXT:    pxor %xmm0, %xmm0
    639 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
    640 ; X64-NEXT:    retq
    641   %vecext = extractelement <4 x i32> %x, i32 0
    642   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    643   %vecext1 = extractelement <4 x i32> %x, i32 1
    644   %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
    645   %vecext2 = extractelement <4 x i32> %x, i32 3
    646   %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
    647   %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
    648   ret <4 x i32> %vecinit4
    649 }
    650 
    651 define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
    652 ; X32-LABEL: i32_shuf_W00W:
    653 ; X32:       ## BB#0:
    654 ; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
    655 ; X32-NEXT:    pxor %xmm0, %xmm0
    656 ; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
    657 ; X32-NEXT:    retl
    658 ;
    659 ; X64-LABEL: i32_shuf_W00W:
    660 ; X64:       ## BB#0:
    661 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
    662 ; X64-NEXT:    pxor %xmm0, %xmm0
    663 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
    664 ; X64-NEXT:    retq
    665   %vecext = extractelement <4 x i32> %x, i32 3
    666   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    667   %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
    668   %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
    669   %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
    670   ret <4 x i32> %vecinit4
    671 }
    672 
    673 define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
    674 ; X32-LABEL: i32_shuf_X00A:
    675 ; X32:       ## BB#0:
    676 ; X32-NEXT:    pxor %xmm2, %xmm2
    677 ; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
    678 ; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
    679 ; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
    680 ; X32-NEXT:    retl
    681 ;
    682 ; X64-LABEL: i32_shuf_X00A:
    683 ; X64:       ## BB#0:
    684 ; X64-NEXT:    pxor %xmm2, %xmm2
    685 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
    686 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
    687 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
    688 ; X64-NEXT:    retq
    689   %vecext = extractelement <4 x i32> %x, i32 0
    690   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    691   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
    692   %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
    693   %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    694   ret <4 x i32> %vecinit4
    695 }
    696 
    697 define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
    698 ; X32-LABEL: i32_shuf_X00X:
    699 ; X32:       ## BB#0:
    700 ; X32-NEXT:    pxor %xmm1, %xmm1
    701 ; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
    702 ; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
    703 ; X32-NEXT:    retl
    704 ;
    705 ; X64-LABEL: i32_shuf_X00X:
    706 ; X64:       ## BB#0:
    707 ; X64-NEXT:    pxor %xmm1, %xmm1
    708 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
    709 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
    710 ; X64-NEXT:    retq
    711   %vecext = extractelement <4 x i32> %x, i32 0
    712   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
    713   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
    714   %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
    715   %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    716   ret <4 x i32> %vecinit4
    717 }
    718 
; Build <x0, 0, x1, a2>: element 0 of %x, a zero, element 1 of %x, then
; element 2 of %a.  The <x0, 0, x1, 0> part matches a pmovzxdq of %x; the
; final lane is blended in from a pshufd of %a.
define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X0YC:
; X32:       ## BB#0:
; X32-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X0YC:
; X64:       ## BB#0:
; X64-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  ; Lane 2 = %x element 1 (index 5 selects the second shuffle operand).
  %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  ; Lane 3 = %a element 2 (index 6 selects the second shuffle operand).
  %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x i32> %vecinit5
}
    740 
;; Test for a bug in the first implementation of LowerBuildVectorv4x32.
;; Builds <x0, x1, x2, 0> (no undef lanes in the build_vector), then returns
;; max(x, that).  The build_vector should lower to a blendps against zero
;; rather than an insertps with a wrong zero-mask.
define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
; X32-LABEL: test_insertps_no_undef:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X32-NEXT:    maxps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_insertps_no_undef:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X64-NEXT:    maxps %xmm1, %xmm0
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ; The compare/select keeps %vecinit5 live so it cannot be folded away.
  %mask = fcmp olt <4 x float> %vecinit5, %x
  %res = select  <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
  ret <4 x float> %res
}
    767 
; vselect on v8i16: there is no 16-bit-element variable blend in SSE4.1, so
; the i1 mask is sign-extended to all-ones/all-zeros lanes (psllw $15 +
; psraw $15) and pblendvb (implicit %xmm0 mask operand) is used as a fallback.
define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
; X32-LABEL: blendvb_fallback:
; X32:       ## BB#0:
; X32-NEXT:    psllw $15, %xmm0
; X32-NEXT:    psraw $15, %xmm0
; X32-NEXT:    pblendvb %xmm1, %xmm2
; X32-NEXT:    movdqa %xmm2, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: blendvb_fallback:
; X64:       ## BB#0:
; X64-NEXT:    psllw $15, %xmm0
; X64-NEXT:    psraw $15, %xmm0
; X64-NEXT:    pblendvb %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm0
; X64-NEXT:    retq
  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
  ret <8 x i16> %ret
}
    787 
; On X32, account for the argument's move to registers.
; The insertps intrinsic's second operand comes from a vector load; the load
; should be folded into insertps' memory operand (imm $48 = 0x30: source
; element 0 into destination lane 3, no zeroing).
define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps    $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load:
; X64:       ## BB#0:
; X64-NEXT:    insertps    $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
  ret <4 x float> %2
}
    804 
;; Use a non-zero CountS for insertps.
;; Try to match a bit more of the instr, since we need the load's offset.
;; imm $96 = 0x60: source element 1, destination lane 2 -- when the load is
;; folded, the source-element index becomes a 4-byte displacement and the
;; resulting imm drops CountS (shown as $32 = destination lane 2 only).
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load_offset:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps    $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load_offset:
; X64:       ## BB#0:
; X64-NEXT:    insertps    $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
  ret <4 x float> %2
}
    822 
;; Try to match a bit more of the instr, since we need the load's offset.
;; Same as above but with an indexed address: imm $192 = 0xC0 (source element
;; 3, destination lane 0) folds into a 12-byte displacement off the scaled
;; index, leaving imm $0.
define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
; X32-LABEL: insertps_from_vector_load_offset_2:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    shll $4, %ecx
; X32-NEXT:    insertps    $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load_offset_2:
; X64:       ## BB#0:
; X64-NEXT:    shlq $4, %rsi
; X64-NEXT:    insertps    $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
  %2 = load <4 x float>, <4 x float>* %1, align 16
  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
  ret <4 x float> %3
}
    843 
; A scalar float is loaded and splatted to all four lanes before the insertps;
; since insertps only reads one source element, the splat should disappear and
; the scalar load fold directly into insertps' memory operand.
define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_loadf32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_loadf32:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT:    retq
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}
    865 
; Same broadcast-then-insertps pattern as above, but the splatted scalar is
; extracted from an (unaligned) vector load; the whole sequence should still
; collapse to a single insertps from memory.
define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
; X32-LABEL: insertps_from_broadcast_loadv4f32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_loadv4f32:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %b, align 4
  %2 = extractelement <4 x float> %1, i32 0
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}
    886 
; The broadcast value feeds four insertps calls.  With multiple uses the load
; must NOT be folded four times; it is loaded once into a register (movss)
; and reused by each insertps.
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_multiple_use:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X32-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; X32-NEXT:    addps %xmm1, %xmm0
; X32-NEXT:    addps %xmm2, %xmm3
; X32-NEXT:    addps %xmm3, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_multiple_use:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X64-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; X64-NEXT:    addps %xmm1, %xmm0
; X64-NEXT:    addps %xmm2, %xmm3
; X64-NEXT:    addps %xmm3, %xmm0
; X64-NEXT:    retq
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
  ; Reduce the four results so none of the insertps calls is dead.
  %11 = fadd <4 x float> %7, %8
  %12 = fadd <4 x float> %9, %10
  %13 = fadd <4 x float> %11, %12
  ret <4 x float> %13
}
    928 
; Shuffle with undef lanes: result is <*%b, undef, a[0], undef>.  With the
; undefs this is matched as a scalar load (movss) + unpcklpd rather than an
; insertps.
define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
; X32-LABEL: insertps_with_undefs:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X32-NEXT:    movapd %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: insertps_with_undefs:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT:    movapd %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  ; Indices: 4 -> %2[0] (the loaded float), 0 -> %a[0], 7 -> %2[3] (undef).
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
  ret <4 x float> %result
}
    949 
; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
; the destination index to change the load, instead of the source index.
; Here the inserted scalar is element 2 of the loaded vector going into lane
; 3, so the folded load offset must come from the source element (2), not the
; destination lane.
define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
; X32-LABEL: pr20087:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: pr20087:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
; X64-NEXT:    retq
  %load = load <4 x float> , <4 x float> *%ptr
  ; Result: <a[0], undef, a[2], load[2]>.
  %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
  ret <4 x float> %ret
}
    967 
; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
; i.e. <x[0], y[3], undef, undef>, stored unaligned through an i32*.
; Expected lowering: pshufd to move y[3] into place, then a pblendw.
define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
; X32-LABEL: insertps_pr20411:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT:    movdqu %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: insertps_pr20411:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT:    movdqu %xmm1, (%rdi)
; X64-NEXT:    retq
  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
  %ptrcast = bitcast i32* %RET to <4 x i32>*
  ; Only align-4 store, hence the movdqu.
  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
  ret void
}
    989 
; Build <A[0], 0, B[2], 0>: a single insertps can insert B[2] into lane 2
; while its zero-mask clears lanes 1 and 3.
define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_4:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_4:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}
   1009 
; Build <A[0], B[1], 0, 0>: insertps inserts B[1] into lane 1 and zeroes
; lanes 2 and 3 via the zero-mask.
define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_5:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_5:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %B, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}
   1029 
; Build <0, A[1], B[2], 0>: insertps inserts B[2] into lane 2 and zeroes
; lanes 0 and 3 via the zero-mask.
define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_6:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_6:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
}
   1048 
; Build <A[0], 0, B[1], 0>: insertps inserts B[1] into lane 2 and zeroes
; lanes 1 and 3 via the zero-mask.
define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_7:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_7:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}
   1068 
; Build <A[0], B[0], 0, 0>: insertps inserts B[0] into lane 1 and zeroes
; lanes 2 and 3 via the zero-mask.
define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_8:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_8:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %B, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}
   1088 
; Build <0, A[0], B[2], 0>: here the insert goes INTO %B's register (A[0]
; into lane 1 of xmm1, zeroing lanes 0 and 3), so an extra movaps copies the
; result back to xmm0.
define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_9:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: insertps_9:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
}
   1109 
; Build <A[0], 0, A[0], 0> from a zeroinitializer base: a single insertps
; duplicates element 0 into lane 2 with the zero-mask clearing lanes 1 and 3.
define <4 x float> @insertps_10(<4 x float> %A)
; X32-LABEL: insertps_10:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_10:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X64-NEXT:    retq
{
  %vecext = extractelement <4 x float> %A, i32 0
  %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
  %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
  ret <4 x float> %vecbuild2
}
   1126 
; Build <0, A[1], 0, A[3]>: the insertelement chain plus trailing shuffle
; should be recognized as a single blendps between %A and a zero vector.
define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
; X32-LABEL: build_vector_to_shuffle_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: build_vector_to_shuffle_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  ; Lane 3 = %A element 3 (index 7 selects the second shuffle operand).
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit3
}
   1146 
; Build <0, A[1], 0, 0>: like the previous test but without the trailing
; shuffle; still expected to lower to one blendps against zero.
define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
; X32-LABEL: build_vector_to_shuffle_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: build_vector_to_shuffle_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  ret <4 x float> %vecinit1
}
   1165