Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
      3 ; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
      4 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
      5 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
      6 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
      7 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
      8 ;
      9 ; Verify that the DAG combiner correctly folds bitwise operations across
     10 ; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
     11 ; basic and always-safe patterns. Also test that the DAG combiner will combine
     12 ; target-specific shuffle instructions where reasonable.
     13 
     14 target triple = "x86_64-unknown-unknown"
     15 
     16 declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
     17 declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
     18 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
     19 
     20 define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
        ; Immediate 27 (0b00011011) reverses all four dwords; two reversals are the
        ; identity, so the only instruction expected is retq.
     21 ; ALL-LABEL: combine_pshufd1:
     22 ; ALL:       # %bb.0: # %entry
     23 ; ALL-NEXT:    retq
     24 entry:
     25   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
     26   %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
     27   ret <4 x i32> %c
     28 }
     29 
     30 define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
        ; As above, but with a pshuflw whose immediate -28 (0xE4) is the identity
        ; word order sandwiched in between; the whole chain must still fold away.
     31 ; ALL-LABEL: combine_pshufd2:
     32 ; ALL:       # %bb.0: # %entry
     33 ; ALL-NEXT:    retq
     34 entry:
     35   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
     36   %b.cast = bitcast <4 x i32> %b to <8 x i16>
     37   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
     38   %c.cast = bitcast <8 x i16> %c to <4 x i32>
     39   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
     40   ret <4 x i32> %d
     41 }
     42 
     43 define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
        ; Same as combine_pshufd2 but with an identity pshufhw (0xE4) in the middle.
     44 ; ALL-LABEL: combine_pshufd3:
     45 ; ALL:       # %bb.0: # %entry
     46 ; ALL-NEXT:    retq
     47 entry:
     48   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
     49   %b.cast = bitcast <4 x i32> %b to <8 x i16>
     50   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
     51   %c.cast = bitcast <8 x i16> %c to <4 x i32>
     52   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
     53   ret <4 x i32> %d
     54 }
     55 
     56 define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
        ; -31 = 0xE1 swaps only dwords 0 and 1, so the two pshufds cancel around the
        ; high-word reversal (imm 27) and a single pshufhw remains.
     57 ; SSE-LABEL: combine_pshufd4:
     58 ; SSE:       # %bb.0: # %entry
     59 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
     60 ; SSE-NEXT:    retq
     61 ;
     62 ; AVX-LABEL: combine_pshufd4:
     63 ; AVX:       # %bb.0: # %entry
     64 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
     65 ; AVX-NEXT:    retq
     66 entry:
     67   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
     68   %b.cast = bitcast <4 x i32> %b to <8 x i16>
     69   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
     70   %c.cast = bitcast <8 x i16> %c to <4 x i32>
     71   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
     72   ret <4 x i32> %d
     73 }
     74 
     75 define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
        ; -76 = 0xB4 swaps only dwords 2 and 3; the pshufds cancel around the
        ; low-word reversal, leaving a single pshuflw.
     76 ; SSE-LABEL: combine_pshufd5:
     77 ; SSE:       # %bb.0: # %entry
     78 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
     79 ; SSE-NEXT:    retq
     80 ;
     81 ; AVX-LABEL: combine_pshufd5:
     82 ; AVX:       # %bb.0: # %entry
     83 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
     84 ; AVX-NEXT:    retq
     85 entry:
     86   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
     87   %b.cast = bitcast <4 x i32> %b to <8 x i16>
     88   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
     89   %c.cast = bitcast <8 x i16> %c to <4 x i32>
     90   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
     91   ret <4 x i32> %d
     92 }
     93 
     94 define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
        ; imm 0 splats dword 0; any further shuffle of a splat is still the same
        ; splat, so one splat instruction (a broadcast on AVX2) is expected.
     95 ; SSE-LABEL: combine_pshufd6:
     96 ; SSE:       # %bb.0: # %entry
     97 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
     98 ; SSE-NEXT:    retq
     99 ;
    100 ; AVX1-LABEL: combine_pshufd6:
    101 ; AVX1:       # %bb.0: # %entry
    102 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
    103 ; AVX1-NEXT:    retq
    104 ;
    105 ; AVX2-LABEL: combine_pshufd6:
    106 ; AVX2:       # %bb.0: # %entry
    107 ; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
    108 ; AVX2-NEXT:    retq
    109 entry:
    110   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
    111   %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
    112   ret <4 x i32> %c
    113 }
    114 
    115 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
        ; pshuflw imm 27 reverses the low four words; applied twice it is the
        ; identity, so no shuffle should be emitted.
    116 ; ALL-LABEL: combine_pshuflw1:
    117 ; ALL:       # %bb.0: # %entry
    118 ; ALL-NEXT:    retq
    119 entry:
    120   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
    121   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
    122   ret <8 x i16> %c
    123 }
    124 
    125 define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
        ; An identity pshufhw (-28 = 0xE4) between two cancelling pshuflws must not
        ; block the fold to a no-op.
    126 ; ALL-LABEL: combine_pshuflw2:
    127 ; ALL:       # %bb.0: # %entry
    128 ; ALL-NEXT:    retq
    129 entry:
    130   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
    131   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
    132   %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
    133   ret <8 x i16> %d
    134 }
    135 
    136 define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
        ; The two pshuflw reversals cancel; only the pshufhw reversal survives.
    137 ; SSE-LABEL: combine_pshuflw3:
    138 ; SSE:       # %bb.0: # %entry
    139 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
    140 ; SSE-NEXT:    retq
    141 ;
    142 ; AVX-LABEL: combine_pshuflw3:
    143 ; AVX:       # %bb.0: # %entry
    144 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
    145 ; AVX-NEXT:    retq
    146 entry:
    147   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
    148   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
    149   %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
    150   ret <8 x i16> %d
    151 }
    152 
    153 define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
        ; Mirror image of combine_pshuflw3: the pshufhw pair cancels and only the
        ; pshuflw reversal survives.
    154 ; SSE-LABEL: combine_pshufhw1:
    155 ; SSE:       # %bb.0: # %entry
    156 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
    157 ; SSE-NEXT:    retq
    158 ;
    159 ; AVX-LABEL: combine_pshufhw1:
    160 ; AVX:       # %bb.0: # %entry
    161 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
    162 ; AVX-NEXT:    retq
    163 entry:
    164   %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
    165   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
    166   %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
    167   ret <8 x i16> %d
    168 }
    169 
    170 define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Both AND operands are shuffled with the same swizzle mask <0,2,1,3>, so
        ; the shuffle is hoisted past the AND and emitted only once.
    171 ; SSE-LABEL: combine_bitwise_ops_test1:
    172 ; SSE:       # %bb.0:
    173 ; SSE-NEXT:    pand %xmm1, %xmm0
    174 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    175 ; SSE-NEXT:    retq
    176 ;
    177 ; AVX-LABEL: combine_bitwise_ops_test1:
    178 ; AVX:       # %bb.0:
    179 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
    180 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
    181 ; AVX-NEXT:    retq
    182   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    183   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    184   %and = and <4 x i32> %shuf1, %shuf2
    185   ret <4 x i32> %and
    186 }
    187 
    188 define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Same fold as test1 with OR instead of AND.
    189 ; SSE-LABEL: combine_bitwise_ops_test2:
    190 ; SSE:       # %bb.0:
    191 ; SSE-NEXT:    por %xmm1, %xmm0
    192 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    193 ; SSE-NEXT:    retq
    194 ;
    195 ; AVX-LABEL: combine_bitwise_ops_test2:
    196 ; AVX:       # %bb.0:
    197 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
    198 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
    199 ; AVX-NEXT:    retq
    200   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    201   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    202   %or = or <4 x i32> %shuf1, %shuf2
    203   ret <4 x i32> %or
    204 }
    205 
    206 define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Same fold as test1 with XOR instead of AND.
    207 ; SSE-LABEL: combine_bitwise_ops_test3:
    208 ; SSE:       # %bb.0:
    209 ; SSE-NEXT:    pxor %xmm1, %xmm0
    210 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    211 ; SSE-NEXT:    retq
    212 ;
    213 ; AVX-LABEL: combine_bitwise_ops_test3:
    214 ; AVX:       # %bb.0:
    215 ; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
    216 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
    217 ; AVX-NEXT:    retq
    218   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    219   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    220   %xor = xor <4 x i32> %shuf1, %shuf2
    221   ret <4 x i32> %xor
    222 }
    223 
    224 define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; As test1, but the mask <4,6,5,7> reads the second shuffle operand; the
        ; same single-shuffle result is expected.
    225 ; SSE-LABEL: combine_bitwise_ops_test4:
    226 ; SSE:       # %bb.0:
    227 ; SSE-NEXT:    pand %xmm1, %xmm0
    228 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    229 ; SSE-NEXT:    retq
    230 ;
    231 ; AVX-LABEL: combine_bitwise_ops_test4:
    232 ; AVX:       # %bb.0:
    233 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
    234 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
    235 ; AVX-NEXT:    retq
    236   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    237   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    238   %and = and <4 x i32> %shuf1, %shuf2
    239   ret <4 x i32> %and
    240 }
    241 
    242 define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Second-operand variant of test2 (OR).
    243 ; SSE-LABEL: combine_bitwise_ops_test5:
    244 ; SSE:       # %bb.0:
    245 ; SSE-NEXT:    por %xmm1, %xmm0
    246 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    247 ; SSE-NEXT:    retq
    248 ;
    249 ; AVX-LABEL: combine_bitwise_ops_test5:
    250 ; AVX:       # %bb.0:
    251 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
    252 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
    253 ; AVX-NEXT:    retq
    254   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    255   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    256   %or = or <4 x i32> %shuf1, %shuf2
    257   ret <4 x i32> %or
    258 }
    259 
    260 define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Second-operand variant of test3 (XOR).
    261 ; SSE-LABEL: combine_bitwise_ops_test6:
    262 ; SSE:       # %bb.0:
    263 ; SSE-NEXT:    pxor %xmm1, %xmm0
    264 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    265 ; SSE-NEXT:    retq
    266 ;
    267 ; AVX-LABEL: combine_bitwise_ops_test6:
    268 ; AVX:       # %bb.0:
    269 ; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
    270 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
    271 ; AVX-NEXT:    retq
    272   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    273   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    274   %xor = xor <4 x i32> %shuf1, %shuf2
    275   ret <4 x i32> %xor
    276 }
    277 
    278 
    279 ; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
    280 ; are not performing a swizzle operations.
    281 
    282 define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Shared mask <0,5,2,7> is a lane blend with %c, not a swizzle; the AND is
        ; still done first and the blend emitted once (blendps on SSE4.1+/AVX).
    283 ; SSE2-LABEL: combine_bitwise_ops_test1b:
    284 ; SSE2:       # %bb.0:
    285 ; SSE2-NEXT:    pand %xmm1, %xmm0
    286 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    287 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    288 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    289 ; SSE2-NEXT:    retq
    290 ;
    291 ; SSSE3-LABEL: combine_bitwise_ops_test1b:
    292 ; SSSE3:       # %bb.0:
    293 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    294 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    295 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    296 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    297 ; SSSE3-NEXT:    retq
    298 ;
    299 ; SSE41-LABEL: combine_bitwise_ops_test1b:
    300 ; SSE41:       # %bb.0:
    301 ; SSE41-NEXT:    andps %xmm1, %xmm0
    302 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
    303 ; SSE41-NEXT:    retq
    304 ;
    305 ; AVX-LABEL: combine_bitwise_ops_test1b:
    306 ; AVX:       # %bb.0:
    307 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
    308 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
    309 ; AVX-NEXT:    retq
    310   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    311   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    312   %and = and <4 x i32> %shuf1, %shuf2
    313   ret <4 x i32> %and
    314 }
    315 
    316 define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Same blend fold as test1b with OR.
    317 ; SSE2-LABEL: combine_bitwise_ops_test2b:
    318 ; SSE2:       # %bb.0:
    319 ; SSE2-NEXT:    por %xmm1, %xmm0
    320 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    321 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    322 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    323 ; SSE2-NEXT:    retq
    324 ;
    325 ; SSSE3-LABEL: combine_bitwise_ops_test2b:
    326 ; SSSE3:       # %bb.0:
    327 ; SSSE3-NEXT:    por %xmm1, %xmm0
    328 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    329 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    330 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    331 ; SSSE3-NEXT:    retq
    332 ;
    333 ; SSE41-LABEL: combine_bitwise_ops_test2b:
    334 ; SSE41:       # %bb.0:
    335 ; SSE41-NEXT:    orps %xmm1, %xmm0
    336 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
    337 ; SSE41-NEXT:    retq
    338 ;
    339 ; AVX-LABEL: combine_bitwise_ops_test2b:
    340 ; AVX:       # %bb.0:
    341 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
    342 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
    343 ; AVX-NEXT:    retq
    344   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    345   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    346   %or = or <4 x i32> %shuf1, %shuf2
    347   ret <4 x i32> %or
    348 }
    349 
    350 define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; XOR variant: the %c lanes cancel to zero, so the result is (a^b) with
        ; lanes 1 and 3 cleared — an AND with a constant mask, or a blend with a
        ; zeroed register.
    351 ; SSE2-LABEL: combine_bitwise_ops_test3b:
    352 ; SSE2:       # %bb.0:
    353 ; SSE2-NEXT:    xorps %xmm1, %xmm0
    354 ; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
    355 ; SSE2-NEXT:    retq
    356 ;
    357 ; SSSE3-LABEL: combine_bitwise_ops_test3b:
    358 ; SSSE3:       # %bb.0:
    359 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
    360 ; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
    361 ; SSSE3-NEXT:    retq
    362 ;
    363 ; SSE41-LABEL: combine_bitwise_ops_test3b:
    364 ; SSE41:       # %bb.0:
    365 ; SSE41-NEXT:    xorps %xmm1, %xmm0
    366 ; SSE41-NEXT:    xorps %xmm1, %xmm1
    367 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
    368 ; SSE41-NEXT:    retq
    369 ;
    370 ; AVX-LABEL: combine_bitwise_ops_test3b:
    371 ; AVX:       # %bb.0:
    372 ; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
    373 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    374 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
    375 ; AVX-NEXT:    retq
    376   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    377   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    378   %xor = xor <4 x i32> %shuf1, %shuf2
    379   ret <4 x i32> %xor
    380 }
    381 
    382 define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; As test1b with the operand order swapped: %c supplies lanes 0 and 2.
    383 ; SSE2-LABEL: combine_bitwise_ops_test4b:
    384 ; SSE2:       # %bb.0:
    385 ; SSE2-NEXT:    pand %xmm1, %xmm0
    386 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    387 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    388 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    389 ; SSE2-NEXT:    retq
    390 ;
    391 ; SSSE3-LABEL: combine_bitwise_ops_test4b:
    392 ; SSSE3:       # %bb.0:
    393 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    394 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    395 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    396 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    397 ; SSSE3-NEXT:    retq
    398 ;
    399 ; SSE41-LABEL: combine_bitwise_ops_test4b:
    400 ; SSE41:       # %bb.0:
    401 ; SSE41-NEXT:    andps %xmm1, %xmm0
    402 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
    403 ; SSE41-NEXT:    retq
    404 ;
    405 ; AVX-LABEL: combine_bitwise_ops_test4b:
    406 ; AVX:       # %bb.0:
    407 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
    408 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
    409 ; AVX-NEXT:    retq
    410   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    411   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    412   %and = and <4 x i32> %shuf1, %shuf2
    413   ret <4 x i32> %and
    414 }
    415 
    416 define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Swapped-operand variant of test2b (OR).
    417 ; SSE2-LABEL: combine_bitwise_ops_test5b:
    418 ; SSE2:       # %bb.0:
    419 ; SSE2-NEXT:    por %xmm1, %xmm0
    420 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    421 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    422 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    423 ; SSE2-NEXT:    retq
    424 ;
    425 ; SSSE3-LABEL: combine_bitwise_ops_test5b:
    426 ; SSSE3:       # %bb.0:
    427 ; SSSE3-NEXT:    por %xmm1, %xmm0
    428 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    429 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    430 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    431 ; SSSE3-NEXT:    retq
    432 ;
    433 ; SSE41-LABEL: combine_bitwise_ops_test5b:
    434 ; SSE41:       # %bb.0:
    435 ; SSE41-NEXT:    orps %xmm1, %xmm0
    436 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
    437 ; SSE41-NEXT:    retq
    438 ;
    439 ; AVX-LABEL: combine_bitwise_ops_test5b:
    440 ; AVX:       # %bb.0:
    441 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
    442 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
    443 ; AVX-NEXT:    retq
    444   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    445   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    446   %or = or <4 x i32> %shuf1, %shuf2
    447   ret <4 x i32> %or
    448 }
    449 
    450 define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Swapped-operand variant of test3b: lanes 0 and 2 are cleared instead.
    451 ; SSE2-LABEL: combine_bitwise_ops_test6b:
    452 ; SSE2:       # %bb.0:
    453 ; SSE2-NEXT:    xorps %xmm1, %xmm0
    454 ; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
    455 ; SSE2-NEXT:    retq
    456 ;
    457 ; SSSE3-LABEL: combine_bitwise_ops_test6b:
    458 ; SSSE3:       # %bb.0:
    459 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
    460 ; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
    461 ; SSSE3-NEXT:    retq
    462 ;
    463 ; SSE41-LABEL: combine_bitwise_ops_test6b:
    464 ; SSE41:       # %bb.0:
    465 ; SSE41-NEXT:    xorps %xmm1, %xmm0
    466 ; SSE41-NEXT:    xorps %xmm1, %xmm1
    467 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
    468 ; SSE41-NEXT:    retq
    469 ;
    470 ; AVX-LABEL: combine_bitwise_ops_test6b:
    471 ; AVX:       # %bb.0:
    472 ; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
    473 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    474 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
    475 ; AVX-NEXT:    retq
    476   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    477   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    478   %xor = xor <4 x i32> %shuf1, %shuf2
    479   ret <4 x i32> %xor
    480 }
    481 
    482 define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Shared mask <0,2,5,7> draws from both sources; after the AND a single
        ; shufps combines the result with %c.
    483 ; SSE-LABEL: combine_bitwise_ops_test1c:
    484 ; SSE:       # %bb.0:
    485 ; SSE-NEXT:    andps %xmm1, %xmm0
    486 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
    487 ; SSE-NEXT:    retq
    488 ;
    489 ; AVX-LABEL: combine_bitwise_ops_test1c:
    490 ; AVX:       # %bb.0:
    491 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
    492 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
    493 ; AVX-NEXT:    retq
    494   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    495   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    496   %and = and <4 x i32> %shuf1, %shuf2
    497   ret <4 x i32> %and
    498 }
    499 
    500 define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Same fold as test1c with OR.
    501 ; SSE-LABEL: combine_bitwise_ops_test2c:
    502 ; SSE:       # %bb.0:
    503 ; SSE-NEXT:    orps %xmm1, %xmm0
    504 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
    505 ; SSE-NEXT:    retq
    506 ;
    507 ; AVX-LABEL: combine_bitwise_ops_test2c:
    508 ; AVX:       # %bb.0:
    509 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
    510 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
    511 ; AVX-NEXT:    retq
    512   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    513   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    514   %or = or <4 x i32> %shuf1, %shuf2
    515   ret <4 x i32> %or
    516 }
    517 
    518 define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; XOR variant: the %c lanes cancel to zero, so lanes 2 and 3 of the result
        ; are zero (insertps with zeroing on SSE4.1+/AVX).
    519 ; SSE2-LABEL: combine_bitwise_ops_test3c:
    520 ; SSE2:       # %bb.0:
    521 ; SSE2-NEXT:    xorps %xmm1, %xmm0
    522 ; SSE2-NEXT:    xorps %xmm1, %xmm1
    523 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
    524 ; SSE2-NEXT:    retq
    525 ;
    526 ; SSSE3-LABEL: combine_bitwise_ops_test3c:
    527 ; SSSE3:       # %bb.0:
    528 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
    529 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
    530 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
    531 ; SSSE3-NEXT:    retq
    532 ;
    533 ; SSE41-LABEL: combine_bitwise_ops_test3c:
    534 ; SSE41:       # %bb.0:
    535 ; SSE41-NEXT:    xorps %xmm1, %xmm0
    536 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
    537 ; SSE41-NEXT:    retq
    538 ;
    539 ; AVX-LABEL: combine_bitwise_ops_test3c:
    540 ; AVX:       # %bb.0:
    541 ; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
    542 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
    543 ; AVX-NEXT:    retq
    544   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    545   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    546   %xor = xor <4 x i32> %shuf1, %shuf2
    547   ret <4 x i32> %xor
    548 }
    549 
    550 define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; As test1c with the operand order swapped: %c supplies the low two lanes.
    551 ; SSE-LABEL: combine_bitwise_ops_test4c:
    552 ; SSE:       # %bb.0:
    553 ; SSE-NEXT:    andps %xmm1, %xmm0
    554 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
    555 ; SSE-NEXT:    movaps %xmm2, %xmm0
    556 ; SSE-NEXT:    retq
    557 ;
    558 ; AVX-LABEL: combine_bitwise_ops_test4c:
    559 ; AVX:       # %bb.0:
    560 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
    561 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
    562 ; AVX-NEXT:    retq
    563   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    564   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    565   %and = and <4 x i32> %shuf1, %shuf2
    566   ret <4 x i32> %and
    567 }
    568 
    569 define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Swapped-operand variant of test2c (OR).
    570 ; SSE-LABEL: combine_bitwise_ops_test5c:
    571 ; SSE:       # %bb.0:
    572 ; SSE-NEXT:    orps %xmm1, %xmm0
    573 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
    574 ; SSE-NEXT:    movaps %xmm2, %xmm0
    575 ; SSE-NEXT:    retq
    576 ;
    577 ; AVX-LABEL: combine_bitwise_ops_test5c:
    578 ; AVX:       # %bb.0:
    579 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
    580 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
    581 ; AVX-NEXT:    retq
    582   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    583   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    584   %or = or <4 x i32> %shuf1, %shuf2
    585   ret <4 x i32> %or
    586 }
    587 
    588 define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Swapped-operand variant of test3c: lanes 0 and 1 become zero.
    589 ; SSE2-LABEL: combine_bitwise_ops_test6c:
    590 ; SSE2:       # %bb.0:
    591 ; SSE2-NEXT:    xorps %xmm1, %xmm0
    592 ; SSE2-NEXT:    xorps %xmm1, %xmm1
    593 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
    594 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    595 ; SSE2-NEXT:    retq
    596 ;
    597 ; SSSE3-LABEL: combine_bitwise_ops_test6c:
    598 ; SSSE3:       # %bb.0:
    599 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
    600 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
    601 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
    602 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
    603 ; SSSE3-NEXT:    retq
    604 ;
    605 ; SSE41-LABEL: combine_bitwise_ops_test6c:
    606 ; SSE41:       # %bb.0:
    607 ; SSE41-NEXT:    xorps %xmm1, %xmm0
    608 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
    609 ; SSE41-NEXT:    retq
    610 ;
    611 ; AVX-LABEL: combine_bitwise_ops_test6c:
    612 ; AVX:       # %bb.0:
    613 ; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
    614 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
    615 ; AVX-NEXT:    retq
    616   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    617   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    618   %xor = xor <4 x i32> %shuf1, %shuf2
    619   ret <4 x i32> %xor
    620 }
    621 
    622 define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
    623 ; SSE-LABEL: combine_nested_undef_test1:
    624 ; SSE:       # %bb.0:
    625 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
    626 ; SSE-NEXT:    retq
    627 ;
    628 ; AVX-LABEL: combine_nested_undef_test1:
    629 ; AVX:       # %bb.0:
    630 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
    631 ; AVX-NEXT:    retq
    632   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
    633   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
    634   ret <4 x i32> %2
    635 }
    636 
    637 define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
    638 ; SSE-LABEL: combine_nested_undef_test2:
    639 ; SSE:       # %bb.0:
    640 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
    641 ; SSE-NEXT:    retq
    642 ;
    643 ; AVX-LABEL: combine_nested_undef_test2:
    644 ; AVX:       # %bb.0:
    645 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
    646 ; AVX-NEXT:    retq
    647   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
    648   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
    649   ret <4 x i32> %2
    650 }
    651 
    652 define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
    653 ; SSE-LABEL: combine_nested_undef_test3:
    654 ; SSE:       # %bb.0:
    655 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
    656 ; SSE-NEXT:    retq
    657 ;
    658 ; AVX-LABEL: combine_nested_undef_test3:
    659 ; AVX:       # %bb.0:
    660 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
    661 ; AVX-NEXT:    retq
    662   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
    663   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
    664   ret <4 x i32> %2
    665 }
    666 
    667 define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
    668 ; SSE-LABEL: combine_nested_undef_test4:
    669 ; SSE:       # %bb.0:
    670 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
    671 ; SSE-NEXT:    retq
    672 ;
    673 ; AVX1-LABEL: combine_nested_undef_test4:
    674 ; AVX1:       # %bb.0:
    675 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
    676 ; AVX1-NEXT:    retq
    677 ;
    678 ; AVX2-LABEL: combine_nested_undef_test4:
    679 ; AVX2:       # %bb.0:
    680 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
    681 ; AVX2-NEXT:    retq
    682   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
    683   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
    684   ret <4 x i32> %2
    685 }
    686 
    687 define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
    688 ; SSE-LABEL: combine_nested_undef_test5:
    689 ; SSE:       # %bb.0:
    690 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
    691 ; SSE-NEXT:    retq
    692 ;
    693 ; AVX-LABEL: combine_nested_undef_test5:
    694 ; AVX:       # %bb.0:
    695 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
    696 ; AVX-NEXT:    retq
    697   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
    698   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
    699   ret <4 x i32> %2
    700 }
    701 
        ; Folds to one shuffle that swaps the two 64-bit halves of %A.
    702 define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
    703 ; SSE-LABEL: combine_nested_undef_test6:
    704 ; SSE:       # %bb.0:
    705 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
    706 ; SSE-NEXT:    retq
    707 ;
    708 ; AVX-LABEL: combine_nested_undef_test6:
    709 ; AVX:       # %bb.0:
    710 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
    711 ; AVX-NEXT:    retq
    712   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
    713   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
    714   ret <4 x i32> %2
    715 }
    716 
        ; Folds to a single shuffle of %A with mask [0,2,0,2].
    717 define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
    718 ; SSE-LABEL: combine_nested_undef_test7:
    719 ; SSE:       # %bb.0:
    720 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
    721 ; SSE-NEXT:    retq
    722 ;
    723 ; AVX-LABEL: combine_nested_undef_test7:
    724 ; AVX:       # %bb.0:
    725 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
    726 ; AVX-NEXT:    retq
    727   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    728   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
    729   ret <4 x i32> %2
    730 }
    731 
        ; Folds to a single shuffle of %A with mask [1,1,3,3].
    732 define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
    733 ; SSE-LABEL: combine_nested_undef_test8:
    734 ; SSE:       # %bb.0:
    735 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
    736 ; SSE-NEXT:    retq
    737 ;
    738 ; AVX-LABEL: combine_nested_undef_test8:
    739 ; AVX:       # %bb.0:
    740 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
    741 ; AVX-NEXT:    retq
    742   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
    743   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
    744   ret <4 x i32> %2
    745 }
    746 
        ; Folds to a single shuffle of %A with mask [1,3,2,2].
    747 define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
    748 ; SSE-LABEL: combine_nested_undef_test9:
    749 ; SSE:       # %bb.0:
    750 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
    751 ; SSE-NEXT:    retq
    752 ;
    753 ; AVX-LABEL: combine_nested_undef_test9:
    754 ; AVX:       # %bb.0:
    755 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2]
    756 ; AVX-NEXT:    retq
    757   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
    758   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
    759   ret <4 x i32> %2
    760 }
    761 
        ; Folds to a single shuffle of %A with mask [1,1,1,3].
    762 define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
    763 ; SSE-LABEL: combine_nested_undef_test10:
    764 ; SSE:       # %bb.0:
    765 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
    766 ; SSE-NEXT:    retq
    767 ;
    768 ; AVX-LABEL: combine_nested_undef_test10:
    769 ; AVX:       # %bb.0:
    770 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,3]
    771 ; AVX-NEXT:    retq
    772   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
    773   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
    774   ret <4 x i32> %2
    775 }
    776 
        ; Folds to a single shuffle of %A with mask [1,1,2,1].
    777 define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
    778 ; SSE-LABEL: combine_nested_undef_test11:
    779 ; SSE:       # %bb.0:
    780 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
    781 ; SSE-NEXT:    retq
    782 ;
    783 ; AVX-LABEL: combine_nested_undef_test11:
    784 ; AVX:       # %bb.0:
    785 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1]
    786 ; AVX-NEXT:    retq
    787   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
    788   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
    789   ret <4 x i32> %2
    790 }
    791 
        ; Folds to a splat of the low 64 bits of %A (a broadcast on AVX2).
    792 define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
    793 ; SSE-LABEL: combine_nested_undef_test12:
    794 ; SSE:       # %bb.0:
    795 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
    796 ; SSE-NEXT:    retq
    797 ;
    798 ; AVX1-LABEL: combine_nested_undef_test12:
    799 ; AVX1:       # %bb.0:
    800 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
    801 ; AVX1-NEXT:    retq
    802 ;
    803 ; AVX2-LABEL: combine_nested_undef_test12:
    804 ; AVX2:       # %bb.0:
    805 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
    806 ; AVX2-NEXT:    retq
    807   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
    808   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
    809   ret <4 x i32> %2
    810 }
    811 
    812 ; The following pair of shuffles is folded into vector %A.
        ; The combined mask is the identity on %A, so no shuffle instructions are emitted.
    813 define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
    814 ; ALL-LABEL: combine_nested_undef_test13:
    815 ; ALL:       # %bb.0:
    816 ; ALL-NEXT:    retq
    817   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
    818   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
    819   ret <4 x i32> %2
    820 }
    821 
    822 ; The following pair of shuffles is folded into vector %B.
        ; The combined mask selects only %B lanes in order, so only a register move remains.
    823 define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
    824 ; SSE-LABEL: combine_nested_undef_test14:
    825 ; SSE:       # %bb.0:
    826 ; SSE-NEXT:    movaps %xmm1, %xmm0
    827 ; SSE-NEXT:    retq
    828 ;
    829 ; AVX-LABEL: combine_nested_undef_test14:
    830 ; AVX:       # %bb.0:
    831 ; AVX-NEXT:    vmovaps %xmm1, %xmm0
    832 ; AVX-NEXT:    retq
    833   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
    834   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
    835   ret <4 x i32> %2
    836 }
    837 
    838 
    839 ; Verify that we don't optimize the following cases. We expect more than one shuffle.
    840 ;
    841 ; FIXME: Many of these already don't make sense, and the rest should stop
    842 ; making sense with the new vector shuffle lowering. Revisit at least testing for
    843 ; it.
    844 
        ; Negative test - the combined mask needs lanes from both %A and %B, so more than one instruction remains.
    845 define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
    846 ; SSE2-LABEL: combine_nested_undef_test15:
    847 ; SSE2:       # %bb.0:
    848 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
    849 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
    850 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    851 ; SSE2-NEXT:    retq
    852 ;
    853 ; SSSE3-LABEL: combine_nested_undef_test15:
    854 ; SSSE3:       # %bb.0:
    855 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
    856 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
    857 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
    858 ; SSSE3-NEXT:    retq
    859 ;
    860 ; SSE41-LABEL: combine_nested_undef_test15:
    861 ; SSE41:       # %bb.0:
    862 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
    863 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
    864 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
    865 ; SSE41-NEXT:    retq
    866 ;
    867 ; AVX1-LABEL: combine_nested_undef_test15:
    868 ; AVX1:       # %bb.0:
    869 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
    870 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
    871 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
    872 ; AVX1-NEXT:    retq
    873 ;
    874 ; AVX2-LABEL: combine_nested_undef_test15:
    875 ; AVX2:       # %bb.0:
    876 ; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
    877 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
    878 ; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
    879 ; AVX2-NEXT:    retq
    880   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
    881   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
    882   ret <4 x i32> %2
    883 }
    884 
        ; Negative test - lanes interleave from both inputs; a shuffle plus blend/unpack remains.
    885 define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
    886 ; SSE2-LABEL: combine_nested_undef_test16:
    887 ; SSE2:       # %bb.0:
    888 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
    889 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
    890 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    891 ; SSE2-NEXT:    retq
    892 ;
    893 ; SSSE3-LABEL: combine_nested_undef_test16:
    894 ; SSSE3:       # %bb.0:
    895 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
    896 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
    897 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    898 ; SSSE3-NEXT:    retq
    899 ;
    900 ; SSE41-LABEL: combine_nested_undef_test16:
    901 ; SSE41:       # %bb.0:
    902 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
    903 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
    904 ; SSE41-NEXT:    retq
    905 ;
    906 ; AVX-LABEL: combine_nested_undef_test16:
    907 ; AVX:       # %bb.0:
    908 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
    909 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
    910 ; AVX-NEXT:    retq
    911   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    912   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
    913   ret <4 x i32> %2
    914 }
    915 
        ; Negative test - a blend of %A/%B followed by one shuffle remains.
    916 define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
    917 ; SSE2-LABEL: combine_nested_undef_test17:
    918 ; SSE2:       # %bb.0:
    919 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
    920 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
    921 ; SSE2-NEXT:    retq
    922 ;
    923 ; SSSE3-LABEL: combine_nested_undef_test17:
    924 ; SSSE3:       # %bb.0:
    925 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
    926 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
    927 ; SSSE3-NEXT:    retq
    928 ;
    929 ; SSE41-LABEL: combine_nested_undef_test17:
    930 ; SSE41:       # %bb.0:
    931 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
    932 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
    933 ; SSE41-NEXT:    retq
    934 ;
    935 ; AVX-LABEL: combine_nested_undef_test17:
    936 ; AVX:       # %bb.0:
    937 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
    938 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
    939 ; AVX-NEXT:    retq
    940   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
    941   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
    942   ret <4 x i32> %2
    943 }
    944 
        ; The combined mask reads only %B, so this collapses to a single shuffle of %B.
    945 define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
    946 ; SSE-LABEL: combine_nested_undef_test18:
    947 ; SSE:       # %bb.0:
    948 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
    949 ; SSE-NEXT:    retq
    950 ;
    951 ; AVX-LABEL: combine_nested_undef_test18:
    952 ; AVX:       # %bb.0:
    953 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3]
    954 ; AVX-NEXT:    retq
    955   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
    956   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
    957   ret <4 x i32> %2
    958 }
    959 
        ; Negative test - a blend of %A and %B plus one shuffle remain.
    960 define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
    961 ; SSE2-LABEL: combine_nested_undef_test19:
    962 ; SSE2:       # %bb.0:
    963 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    964 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
    965 ; SSE2-NEXT:    retq
    966 ;
    967 ; SSSE3-LABEL: combine_nested_undef_test19:
    968 ; SSSE3:       # %bb.0:
    969 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    970 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
    971 ; SSSE3-NEXT:    retq
    972 ;
    973 ; SSE41-LABEL: combine_nested_undef_test19:
    974 ; SSE41:       # %bb.0:
    975 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
    976 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
    977 ; SSE41-NEXT:    retq
    978 ;
    979 ; AVX-LABEL: combine_nested_undef_test19:
    980 ; AVX:       # %bb.0:
    981 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
    982 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
    983 ; AVX-NEXT:    retq
    984   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
    985   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
    986   ret <4 x i32> %2
    987 }
    988 
        ; Negative test - a blend (or two shufps) plus one shuffle remain.
    989 define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
    990 ; SSE2-LABEL: combine_nested_undef_test20:
    991 ; SSE2:       # %bb.0:
    992 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
    993 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
    994 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    995 ; SSE2-NEXT:    retq
    996 ;
    997 ; SSSE3-LABEL: combine_nested_undef_test20:
    998 ; SSSE3:       # %bb.0:
    999 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
   1000 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
   1001 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1002 ; SSSE3-NEXT:    retq
   1003 ;
   1004 ; SSE41-LABEL: combine_nested_undef_test20:
   1005 ; SSE41:       # %bb.0:
   1006 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
   1007 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
   1008 ; SSE41-NEXT:    retq
   1009 ;
   1010 ; AVX-LABEL: combine_nested_undef_test20:
   1011 ; AVX:       # %bb.0:
   1012 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
   1013 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0]
   1014 ; AVX-NEXT:    retq
   1015   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
   1016   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
   1017   ret <4 x i32> %2
   1018 }
   1019 
        ; Negative test - a blend plus a low-64-bit duplication remain (broadcast on AVX2).
   1020 define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
   1021 ; SSE2-LABEL: combine_nested_undef_test21:
   1022 ; SSE2:       # %bb.0:
   1023 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
   1024 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
   1025 ; SSE2-NEXT:    retq
   1026 ;
   1027 ; SSSE3-LABEL: combine_nested_undef_test21:
   1028 ; SSSE3:       # %bb.0:
   1029 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
   1030 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
   1031 ; SSSE3-NEXT:    retq
   1032 ;
   1033 ; SSE41-LABEL: combine_nested_undef_test21:
   1034 ; SSE41:       # %bb.0:
   1035 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   1036 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1037 ; SSE41-NEXT:    retq
   1038 ;
   1039 ; AVX1-LABEL: combine_nested_undef_test21:
   1040 ; AVX1:       # %bb.0:
   1041 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1042 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1043 ; AVX1-NEXT:    retq
   1044 ;
   1045 ; AVX2-LABEL: combine_nested_undef_test21:
   1046 ; AVX2:       # %bb.0:
   1047 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1048 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
   1049 ; AVX2-NEXT:    retq
   1050   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
   1051   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
   1052   ret <4 x i32> %2
   1053 }
   1054 
   1055 
   1056 ; Test that we correctly combine shuffles according to rule
   1057 ;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
   1058 
        ; The live lanes all come from the second operand, so this collapses to one shuffle of %B.
   1059 define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
   1060 ; SSE-LABEL: combine_nested_undef_test22:
   1061 ; SSE:       # %bb.0:
   1062 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
   1063 ; SSE-NEXT:    retq
   1064 ;
   1065 ; AVX-LABEL: combine_nested_undef_test22:
   1066 ; AVX:       # %bb.0:
   1067 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3]
   1068 ; AVX-NEXT:    retq
   1069   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
   1070   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
   1071   ret <4 x i32> %2
   1073 
        ; The live lanes all come from the second operand, so this collapses to one shuffle of %B.
   1074 define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
   1075 ; SSE-LABEL: combine_nested_undef_test23:
   1076 ; SSE:       # %bb.0:
   1077 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
   1078 ; SSE-NEXT:    retq
   1079 ;
   1080 ; AVX-LABEL: combine_nested_undef_test23:
   1081 ; AVX:       # %bb.0:
   1082 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3]
   1083 ; AVX-NEXT:    retq
   1084   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
   1085   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
   1086   ret <4 x i32> %2
   1087 }
   1088 
        ; The live lanes all come from the second operand, so this collapses to one shuffle of %B.
   1089 define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
   1090 ; SSE-LABEL: combine_nested_undef_test24:
   1091 ; SSE:       # %bb.0:
   1092 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
   1093 ; SSE-NEXT:    retq
   1094 ;
   1095 ; AVX-LABEL: combine_nested_undef_test24:
   1096 ; AVX:       # %bb.0:
   1097 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3]
   1098 ; AVX-NEXT:    retq
   1099   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
   1100   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
   1101   ret <4 x i32> %2
   1102 }
   1103 
        ; Operands are swapped in the first shuffle; the result is a splat of the low 64 bits of %A.
   1104 define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
   1105 ; SSE-LABEL: combine_nested_undef_test25:
   1106 ; SSE:       # %bb.0:
   1107 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1108 ; SSE-NEXT:    retq
   1109 ;
   1110 ; AVX1-LABEL: combine_nested_undef_test25:
   1111 ; AVX1:       # %bb.0:
   1112 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1113 ; AVX1-NEXT:    retq
   1114 ;
   1115 ; AVX2-LABEL: combine_nested_undef_test25:
   1116 ; AVX2:       # %bb.0:
   1117 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
   1118 ; AVX2-NEXT:    retq
   1119   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
   1120   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
   1121   ret <4 x i32> %2
   1122 }
   1123 
        ; Collapses to one shuffle duplicating the high 64 bits of %A.
   1124 define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
   1125 ; SSE-LABEL: combine_nested_undef_test26:
   1126 ; SSE:       # %bb.0:
   1127 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
   1128 ; SSE-NEXT:    retq
   1129 ;
   1130 ; AVX-LABEL: combine_nested_undef_test26:
   1131 ; AVX:       # %bb.0:
   1132 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
   1133 ; AVX-NEXT:    retq
   1134   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
   1135   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
   1136   ret <4 x i32> %2
   1137 }
   1138 
        ; Collapses to a splat of the low 64 bits of %A (a broadcast on AVX2).
   1139 define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
   1140 ; SSE-LABEL: combine_nested_undef_test27:
   1141 ; SSE:       # %bb.0:
   1142 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1143 ; SSE-NEXT:    retq
   1144 ;
   1145 ; AVX1-LABEL: combine_nested_undef_test27:
   1146 ; AVX1:       # %bb.0:
   1147 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1148 ; AVX1-NEXT:    retq
   1149 ;
   1150 ; AVX2-LABEL: combine_nested_undef_test27:
   1151 ; AVX2:       # %bb.0:
   1152 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
   1153 ; AVX2-NEXT:    retq
   1154   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
   1155   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
   1156   ret <4 x i32> %2
   1157 }
   1158 
        ; Collapses to one shuffle of %A with mask [0,1,1,0].
   1159 define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
   1160 ; SSE-LABEL: combine_nested_undef_test28:
   1161 ; SSE:       # %bb.0:
   1162 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
   1163 ; SSE-NEXT:    retq
   1164 ;
   1165 ; AVX-LABEL: combine_nested_undef_test28:
   1166 ; AVX:       # %bb.0:
   1167 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
   1168 ; AVX-NEXT:    retq
   1169   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
   1170   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
   1171   ret <4 x i32> %2
   1172 }
   1173 
        ; The second shuffle re-selects every lane from %b, so only a register move remains.
   1174 define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
   1175 ; SSE-LABEL: combine_test1:
   1176 ; SSE:       # %bb.0:
   1177 ; SSE-NEXT:    movaps %xmm1, %xmm0
   1178 ; SSE-NEXT:    retq
   1179 ;
   1180 ; AVX-LABEL: combine_test1:
   1181 ; AVX:       # %bb.0:
   1182 ; AVX-NEXT:    vmovaps %xmm1, %xmm0
   1183 ; AVX-NEXT:    retq
   1184   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1185   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1186   ret <4 x float> %2
   1187 }
   1188 
        ; Folds to a single blend taking lane 0 from %a and the remaining lanes from %b.
   1189 define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
   1190 ; SSE2-LABEL: combine_test2:
   1191 ; SSE2:       # %bb.0:
   1192 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1193 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1194 ; SSE2-NEXT:    retq
   1195 ;
   1196 ; SSSE3-LABEL: combine_test2:
   1197 ; SSSE3:       # %bb.0:
   1198 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1199 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1200 ; SSSE3-NEXT:    retq
   1201 ;
   1202 ; SSE41-LABEL: combine_test2:
   1203 ; SSE41:       # %bb.0:
   1204 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1205 ; SSE41-NEXT:    retq
   1206 ;
   1207 ; AVX-LABEL: combine_test2:
   1208 ; AVX:       # %bb.0:
   1209 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1210 ; AVX-NEXT:    retq
   1211   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1212   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
   1213   ret <4 x float> %2
   1214 }
   1215 
        ; Folds to movlhps - the low 64 bits of %a concatenated with the low 64 bits of %b.
   1216 define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
   1217 ; SSE-LABEL: combine_test3:
   1218 ; SSE:       # %bb.0:
   1219 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1220 ; SSE-NEXT:    retq
   1221 ;
   1222 ; AVX-LABEL: combine_test3:
   1223 ; AVX:       # %bb.0:
   1224 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1225 ; AVX-NEXT:    retq
   1226   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   1227   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   1228   ret <4 x float> %2
   1229 }
   1230 
        ; Folds to one high-half unpack with %b providing the low 64 bits of the result.
   1231 define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
   1232 ; SSE-LABEL: combine_test4:
   1233 ; SSE:       # %bb.0:
   1234 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   1235 ; SSE-NEXT:    retq
   1236 ;
   1237 ; AVX-LABEL: combine_test4:
   1238 ; AVX:       # %bb.0:
   1239 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   1240 ; AVX-NEXT:    retq
   1241   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   1242   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   1243   ret <4 x float> %2
   1244 }
   1245 
        ; Folds to a single blend taking lane 1 from %a and the remaining lanes from %b.
   1246 define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
   1247 ; SSE2-LABEL: combine_test5:
   1248 ; SSE2:       # %bb.0:
   1249 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1250 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1251 ; SSE2-NEXT:    retq
   1252 ;
   1253 ; SSSE3-LABEL: combine_test5:
   1254 ; SSSE3:       # %bb.0:
   1255 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1256 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1257 ; SSSE3-NEXT:    retq
   1258 ;
   1259 ; SSE41-LABEL: combine_test5:
   1260 ; SSE41:       # %bb.0:
   1261 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1262 ; SSE41-NEXT:    retq
   1263 ;
   1264 ; AVX-LABEL: combine_test5:
   1265 ; AVX:       # %bb.0:
   1266 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1267 ; AVX-NEXT:    retq
   1268   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1269   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   1270   ret <4 x float> %2
   1271 }
   1272 
        ; Integer variant of combine_test1 - only a register move of %b remains.
   1273 define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
   1274 ; SSE-LABEL: combine_test6:
   1275 ; SSE:       # %bb.0:
   1276 ; SSE-NEXT:    movaps %xmm1, %xmm0
   1277 ; SSE-NEXT:    retq
   1278 ;
   1279 ; AVX-LABEL: combine_test6:
   1280 ; AVX:       # %bb.0:
   1281 ; AVX-NEXT:    vmovaps %xmm1, %xmm0
   1282 ; AVX-NEXT:    retq
   1283   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1284   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1285   ret <4 x i32> %2
   1286 }
   1287 
        ; Integer variant of combine_test2 - a single blend of lane 0 of %a with %b.
   1288 define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
   1289 ; SSE2-LABEL: combine_test7:
   1290 ; SSE2:       # %bb.0:
   1291 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1292 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1293 ; SSE2-NEXT:    retq
   1294 ;
   1295 ; SSSE3-LABEL: combine_test7:
   1296 ; SSSE3:       # %bb.0:
   1297 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1298 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1299 ; SSSE3-NEXT:    retq
   1300 ;
   1301 ; SSE41-LABEL: combine_test7:
   1302 ; SSE41:       # %bb.0:
   1303 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1304 ; SSE41-NEXT:    retq
   1305 ;
   1306 ; AVX-LABEL: combine_test7:
   1307 ; AVX:       # %bb.0:
   1308 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1309 ; AVX-NEXT:    retq
   1310   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1311   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
   1312   ret <4 x i32> %2
   1313 }
   1314 
        ; Integer variant of combine_test3 - folds to movlhps of the low halves of %a and %b.
   1315 define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
   1316 ; SSE-LABEL: combine_test8:
   1317 ; SSE:       # %bb.0:
   1318 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1319 ; SSE-NEXT:    retq
   1320 ;
   1321 ; AVX-LABEL: combine_test8:
   1322 ; AVX:       # %bb.0:
   1323 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1324 ; AVX-NEXT:    retq
   1325   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   1326   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   1327   ret <4 x i32> %2
   1328 }
   1329 
        ; Integer variant of combine_test4 - one high-half unpack combining %b and %a.
   1330 define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
   1331 ; SSE-LABEL: combine_test9:
   1332 ; SSE:       # %bb.0:
   1333 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   1334 ; SSE-NEXT:    movaps %xmm1, %xmm0
   1335 ; SSE-NEXT:    retq
   1336 ;
   1337 ; AVX-LABEL: combine_test9:
   1338 ; AVX:       # %bb.0:
   1339 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   1340 ; AVX-NEXT:    retq
   1341   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   1342   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   1343   ret <4 x i32> %2
   1344 }
   1345 
        ; Integer variant of combine_test5 - a single blend of lane 1 of %a with %b.
   1346 define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
   1347 ; SSE2-LABEL: combine_test10:
   1348 ; SSE2:       # %bb.0:
   1349 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1350 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1351 ; SSE2-NEXT:    retq
   1352 ;
   1353 ; SSSE3-LABEL: combine_test10:
   1354 ; SSSE3:       # %bb.0:
   1355 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1356 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1357 ; SSSE3-NEXT:    retq
   1358 ;
   1359 ; SSE41-LABEL: combine_test10:
   1360 ; SSE41:       # %bb.0:
   1361 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1362 ; SSE41-NEXT:    retq
   1363 ;
   1364 ; AVX-LABEL: combine_test10:
   1365 ; AVX:       # %bb.0:
   1366 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1367 ; AVX-NEXT:    retq
   1368   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1369   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   1370   ret <4 x i32> %2
   1371 }
   1372 
        ; The shuffle pair composes to the identity on %a, so no instructions are emitted.
   1373 define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
   1374 ; ALL-LABEL: combine_test11:
   1375 ; ALL:       # %bb.0:
   1376 ; ALL-NEXT:    retq
   1377   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1378   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1379   ret <4 x float> %2
   1380 }
   1381 
        ; Folds to a single blend taking lane 0 from %a and the remaining lanes from %b.
   1382 define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
   1383 ; SSE2-LABEL: combine_test12:
   1384 ; SSE2:       # %bb.0:
   1385 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1386 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1387 ; SSE2-NEXT:    retq
   1388 ;
   1389 ; SSSE3-LABEL: combine_test12:
   1390 ; SSSE3:       # %bb.0:
   1391 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1392 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1393 ; SSSE3-NEXT:    retq
   1394 ;
   1395 ; SSE41-LABEL: combine_test12:
   1396 ; SSE41:       # %bb.0:
   1397 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1398 ; SSE41-NEXT:    retq
   1399 ;
   1400 ; AVX-LABEL: combine_test12:
   1401 ; AVX:       # %bb.0:
   1402 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1403 ; AVX-NEXT:    retq
   1404   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   1405   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   1406   ret <4 x float> %2
   1407 }
   1408 
        ; Folds to movlhps - the low 64 bits of %a concatenated with the low 64 bits of %b.
   1409 define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
   1410 ; SSE-LABEL: combine_test13:
   1411 ; SSE:       # %bb.0:
   1412 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1413 ; SSE-NEXT:    retq
   1414 ;
   1415 ; AVX-LABEL: combine_test13:
   1416 ; AVX:       # %bb.0:
   1417 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1418 ; AVX-NEXT:    retq
   1419   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   1420   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   1421   ret <4 x float> %2
   1422 }
   1423 
        ; Folds to one high-half unpack of %a and %b.
   1424 define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
   1425 ; SSE-LABEL: combine_test14:
   1426 ; SSE:       # %bb.0:
   1427 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1428 ; SSE-NEXT:    retq
   1429 ;
   1430 ; AVX-LABEL: combine_test14:
   1431 ; AVX:       # %bb.0:
   1432 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1433 ; AVX-NEXT:    retq
   1434   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
   1435   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   1436   ret <4 x float> %2
   1437 }
   1438 
; The shuffle pair selects b[0],a[1],b[2],b[3]; a single blendps on SSE4.1/AVX,
; a two-shufps sequence on older SSE subtargets.
define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test15:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test15:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test15:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test15:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %2
}
   1465 
; Applying the same mask twice reconstructs %a exactly (the second shuffle's
; %a operand restores lanes 0 and 2), so the combined shuffle is the identity
; and no instructions should be emitted.
define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
; ALL-LABEL: combine_test16:
; ALL:       # %bb.0:
; ALL-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  ret <4 x i32> %2
}
   1474 
; Integer version of combine_test12: folds to a single movss/blendps selecting
; a[0],b[1],b[2],b[3].
define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test17:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test17:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}
   1501 
; Integer version of combine_test13: folds to a single movlhps.
define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test18:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test18:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}
   1516 
; Integer version of combine_test14: folds to a single unpckhpd.
define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test19:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test19:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i32> %2
}
   1531 
; Integer version of combine_test15: selects b[0],a[1],b[2],b[3] — one blendps
; on SSE4.1/AVX, two shufps on older subtargets.
define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test20:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test20:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test20:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test20:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}
   1558 
; Extracts two <4 x i32> halves from a <8 x i32>: lanes 0,1,4,5 are stored
; through %ptr, lanes 2,3,6,7 are returned. Checks the resulting
; movlhps/unpckhpd pair (plus a vextractf128 of the upper half on AVX).
define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
; SSE-LABEL: combine_test21:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    movaps %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test21:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    vmovaps %xmm2, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  store <4 x i32> %1, <4 x i32>* %ptr, align 16
  ret <4 x i32> %2
}
   1581 
   1582 define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
   1583 ; SSE-LABEL: combine_test22:
   1584 ; SSE:       # %bb.0:
   1585 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
   1586 ; SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
   1587 ; SSE-NEXT:    retq
   1588 ;
   1589 ; AVX-LABEL: combine_test22:
   1590 ; AVX:       # %bb.0:
   1591 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
   1592 ; AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
   1593 ; AVX-NEXT:    retq
   1594 ; Current AVX2 lowering of this is still awful, not adding a test case.
   1595   %1 = load <2 x float>, <2 x float>* %a, align 8
   1596   %2 = load <2 x float>, <2 x float>* %b, align 8
   1597   %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   1598   ret <8 x float> %3
   1599 }
   1600 
   1601 ; PR22359
; PR22359: the two <2 x float> extract-and-store operations cover the
; contiguous low 16 bytes of %v, so they should merge into one unaligned
; 128-bit store (movups).
define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) {
; SSE-LABEL: combine_test23:
; SSE:       # %bb.0:
; SSE-NEXT:    movups %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test23:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups %xmm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1
  %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1>
  %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3>
  store <2 x float> %shuffle0, <2 x float>* %ptr, align 8
  store <2 x float> %shuffle1, <2 x float>* %idx2, align 8
  ret void
}
   1620 
   1621 ; Check some negative cases.
   1622 ; FIXME: Do any of these really make sense? Are they redundant with the above tests?
   1623 
; The combined result only reads %b (b[0],b[1],b[2],b[0]), so this should
; collapse to a single shufps/vpermilps of %b.
define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1b:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test1b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
  ret <4 x float> %2
}
   1639 
; The combined result is b[0],b[1],b[0],b[1] — a splat of %b's low 64 bits —
; so it should lower to movddup (movlhps on SSE2, which lacks movddup for xmm).
define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}
   1665 
; The combined result is a[0],b[3],b[2],b[3] — a genuine two-input mix that
; cannot fold to one instruction; checks the blend+permute (or shufps pair)
; lowering.
define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test3b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test3b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test3b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test3b:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
  ret <4 x float> %2
}
   1694 
; The combined result is b[1],b[1],b[2],b[3] — only %b is live — so a single
; shufps/vpermilps of %b suffices.
define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4b:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test4b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
  ret <4 x float> %2
}
   1710 
   1711 
   1712 ; Verify that we correctly fold shuffles even when we use illegal vector types.
   1713 
; Illegal-type (<4 x i8>) variant: the shuffle pair selects A[0],B[1],B[2],B[3]
; and should still fold to a single blend after type legalization (zero-extend
; to 32-bit lanes, then movss/pblendw/vpblendd).
define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test1c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test1c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test1c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test1c:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test1c:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i8> %2
}
   1759 
; Illegal-type variant selecting A[0],A[1],B[0],B[1]: after widening, the
; shuffle pair should fold to a single punpcklqdq.
define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test2c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x i8> %2
}
   1802 
; Illegal-type variant selecting B[2],B[3],A[2],A[3]: after widening, the
; shuffle pair should fold to a single punpckhqdq.
define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test3c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test3c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test3c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test3c:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i8> %2
}
   1845 
; Illegal-type variant selecting B[0],A[1],B[2],B[3]: after widening, the
; shuffle pair should fold to a single blend (pblendw/vpblendd), with a
; two-shufps fallback before SSE4.1.
define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test4c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test4c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test4c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test4c:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test4c:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i8> %2
}
   1897 
   1898 
   1899 ; The following test cases are generated from this C++ code
   1900 ;
   1901 ;__m128 blend_01(__m128 a, __m128 b)
   1902 ;{
   1903 ;  __m128 s = a;
   1904 ;  s = _mm_blend_ps( s, b, 1<<0 );
   1905 ;  s = _mm_blend_ps( s, b, 1<<1 );
   1906 ;  return s;
   1907 ;}
   1908 ;
   1909 ;__m128 blend_02(__m128 a, __m128 b)
   1910 ;{
   1911 ;  __m128 s = a;
   1912 ;  s = _mm_blend_ps( s, b, 1<<0 );
   1913 ;  s = _mm_blend_ps( s, b, 1<<2 );
   1914 ;  return s;
   1915 ;}
   1916 ;
   1917 ;__m128 blend_123(__m128 a, __m128 b)
   1918 ;{
   1919 ;  __m128 s = a;
   1920 ;  s = _mm_blend_ps( s, b, 1<<1 );
   1921 ;  s = _mm_blend_ps( s, b, 1<<2 );
   1922 ;  s = _mm_blend_ps( s, b, 1<<3 );
   1923 ;  return s;
   1924 ;}
   1925 
   1926 ; Ideally, we should collapse the following shuffles into a single one.
   1927 
; Matches blend_01 from the C++ reference above: two successive single-lane
; blends (lanes 0 and 1 from %b) should collapse to one movsd/blendps.
define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_01:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_01:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_01:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_01:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle6
}
   1952 
; Matches blend_02 from the C++ reference above: blends of lanes 0 and 2 from
; %b should collapse to one blendps (shufps pair before SSE4.1).
define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_02:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_02:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_02:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_02:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %shuffle6
}
   1981 
; Matches blend_123 from the C++ reference above: three successive blends of
; lanes 1, 2, 3 from %b should collapse to one movss/blendps.
define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_123:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_123:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_123:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_123:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
  %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %shuffle12
}
   2009 
; The shuffle pair selects b[2],b[3],a[2],a[3] and should fold to a single
; unpckhpd with %b as the first operand.
define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_1:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
   2025 
; Same final selection as combine_test_movhl_1 via different masks; must still
; fold to a single unpckhpd of %b and %a.
define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_2:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
  ret <4 x i32> %2
}
   2041 
; A third mask combination with the same final selection; must still fold to a
; single unpckhpd of %b and %a.
define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_3:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
  ret <4 x i32> %2
}
   2057 
   2058 
   2059 ; Verify that we fold shuffles according to rule:
   2060 ;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
   2061 
; Undef-input fold rule (see comment above): the inner single-input shuffle
; merges into the outer one, here leaving b[0],b[1],a[2],a[3] — one
; movsd/blendps.
define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
  ret <4 x float> %2
}
   2086 
; Undef-input fold yielding a[0],a[1],b[0],b[1] — a single movlhps.
define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  ret <4 x float> %2
}
   2101 
        ; Masks resolve to <a[0], a[1], b[0], undef>: still foldable to one movlhps.
   2102 define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
   2103 ; SSE-LABEL: combine_undef_input_test3:
   2104 ; SSE:       # %bb.0:
   2105 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2106 ; SSE-NEXT:    retq
   2107 ;
   2108 ; AVX-LABEL: combine_undef_input_test3:
   2109 ; AVX:       # %bb.0:
   2110 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2111 ; AVX-NEXT:    retq
   2112   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   2113   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   2114   ret <4 x float> %2
   2115 }
   2116 
        ; Masks resolve to <b[2], b[3], a[2], a[3]>: folds to movhlps/vunpckhpd.
   2117 define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
   2118 ; SSE-LABEL: combine_undef_input_test4:
   2119 ; SSE:       # %bb.0:
   2120 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2121 ; SSE-NEXT:    retq
   2122 ;
   2123 ; AVX-LABEL: combine_undef_input_test4:
   2124 ; AVX:       # %bb.0:
   2125 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2126 ; AVX-NEXT:    retq
   2127   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2128   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   2129   ret <4 x float> %2
   2130 }
   2131 
        ; Masks resolve to <a[0], a[1], b[2], b[3]>: folds to one movsd/blendps.
   2132 define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
   2133 ; SSE2-LABEL: combine_undef_input_test5:
   2134 ; SSE2:       # %bb.0:
   2135 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
   2136 ; SSE2-NEXT:    movapd %xmm1, %xmm0
   2137 ; SSE2-NEXT:    retq
   2138 ;
   2139 ; SSSE3-LABEL: combine_undef_input_test5:
   2140 ; SSSE3:       # %bb.0:
   2141 ; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
   2142 ; SSSE3-NEXT:    movapd %xmm1, %xmm0
   2143 ; SSSE3-NEXT:    retq
   2144 ;
   2145 ; SSE41-LABEL: combine_undef_input_test5:
   2146 ; SSE41:       # %bb.0:
   2147 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
   2148 ; SSE41-NEXT:    retq
   2149 ;
   2150 ; AVX-LABEL: combine_undef_input_test5:
   2151 ; AVX:       # %bb.0:
   2152 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
   2153 ; AVX-NEXT:    retq
   2154   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   2155   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
   2156   ret <4 x float> %2
   2157 }
   2158 
   2159 
   2160 ; Verify that we fold shuffles according to rule:
   2161 ;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
   2162 
        ; Second shuffle reuses %a, masks compose to the identity: no code emitted.
   2163 define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
   2164 ; ALL-LABEL: combine_undef_input_test6:
   2165 ; ALL:       # %bb.0:
   2166 ; ALL-NEXT:    retq
   2167   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
   2168   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
   2169   ret <4 x float> %2
   2170 }
   2171 
        ; Masks compose to <a[0], a[1], a[0], a[1]>: a low-double splat, i.e.
        ; movlhps on SSE2 and movddup where available.
   2172 define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
   2173 ; SSE2-LABEL: combine_undef_input_test7:
   2174 ; SSE2:       # %bb.0:
   2175 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
   2176 ; SSE2-NEXT:    retq
   2177 ;
   2178 ; SSSE3-LABEL: combine_undef_input_test7:
   2179 ; SSSE3:       # %bb.0:
   2180 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2181 ; SSSE3-NEXT:    retq
   2182 ;
   2183 ; SSE41-LABEL: combine_undef_input_test7:
   2184 ; SSE41:       # %bb.0:
   2185 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2186 ; SSE41-NEXT:    retq
   2187 ;
   2188 ; AVX-LABEL: combine_undef_input_test7:
   2189 ; AVX:       # %bb.0:
   2190 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
   2191 ; AVX-NEXT:    retq
   2192   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
   2193   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
   2194   ret <4 x float> %2
   2195 }
   2196 
        ; Same low-double splat as test7, reached through different masks.
   2197 define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
   2198 ; SSE2-LABEL: combine_undef_input_test8:
   2199 ; SSE2:       # %bb.0:
   2200 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
   2201 ; SSE2-NEXT:    retq
   2202 ;
   2203 ; SSSE3-LABEL: combine_undef_input_test8:
   2204 ; SSSE3:       # %bb.0:
   2205 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2206 ; SSSE3-NEXT:    retq
   2207 ;
   2208 ; SSE41-LABEL: combine_undef_input_test8:
   2209 ; SSE41:       # %bb.0:
   2210 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2211 ; SSE41-NEXT:    retq
   2212 ;
   2213 ; AVX-LABEL: combine_undef_input_test8:
   2214 ; AVX:       # %bb.0:
   2215 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
   2216 ; AVX-NEXT:    retq
   2217   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   2218   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   2219   ret <4 x float> %2
   2220 }
   2221 
        ; Masks compose to <a[2], a[3], a[2], a[3]>: a high-double splat.
   2222 define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
   2223 ; SSE-LABEL: combine_undef_input_test9:
   2224 ; SSE:       # %bb.0:
   2225 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
   2226 ; SSE-NEXT:    retq
   2227 ;
   2228 ; AVX-LABEL: combine_undef_input_test9:
   2229 ; AVX:       # %bb.0:
   2230 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
   2231 ; AVX-NEXT:    retq
   2232   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2233   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   2234   ret <4 x float> %2
   2235 }
   2236 
        ; Masks compose to the identity permutation of %a: no code emitted.
   2237 define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
   2238 ; ALL-LABEL: combine_undef_input_test10:
   2239 ; ALL:       # %bb.0:
   2240 ; ALL-NEXT:    retq
   2241   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   2242   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
   2243   ret <4 x float> %2
   2244 }
   2245 
        ; Same fold as test1 but with operands of the outer shuffle swapped
        ; (%b first); still combines to a single movsd/blendps.
   2246 define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
   2247 ; SSE2-LABEL: combine_undef_input_test11:
   2248 ; SSE2:       # %bb.0:
   2249 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2250 ; SSE2-NEXT:    retq
   2251 ;
   2252 ; SSSE3-LABEL: combine_undef_input_test11:
   2253 ; SSSE3:       # %bb.0:
   2254 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2255 ; SSSE3-NEXT:    retq
   2256 ;
   2257 ; SSE41-LABEL: combine_undef_input_test11:
   2258 ; SSE41:       # %bb.0:
   2259 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
   2260 ; SSE41-NEXT:    retq
   2261 ;
   2262 ; AVX-LABEL: combine_undef_input_test11:
   2263 ; AVX:       # %bb.0:
   2264 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
   2265 ; AVX-NEXT:    retq
   2266   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
   2267   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
   2268   ret <4 x float> %2
   2269 }
   2270 
        ; Swapped-operand variant of test2: still folds to one movlhps.
   2271 define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
   2272 ; SSE-LABEL: combine_undef_input_test12:
   2273 ; SSE:       # %bb.0:
   2274 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2275 ; SSE-NEXT:    retq
   2276 ;
   2277 ; AVX-LABEL: combine_undef_input_test12:
   2278 ; AVX:       # %bb.0:
   2279 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2280 ; AVX-NEXT:    retq
   2281   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
   2282   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
   2283   ret <4 x float> %2
   2284 }
   2285 
        ; Swapped-operand variant of test3: still folds to one movlhps.
   2286 define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
   2287 ; SSE-LABEL: combine_undef_input_test13:
   2288 ; SSE:       # %bb.0:
   2289 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2290 ; SSE-NEXT:    retq
   2291 ;
   2292 ; AVX-LABEL: combine_undef_input_test13:
   2293 ; AVX:       # %bb.0:
   2294 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2295 ; AVX-NEXT:    retq
   2296   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   2297   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
   2298   ret <4 x float> %2
   2299 }
   2300 
        ; Swapped-operand variant of test4: folds to movhlps/vunpckhpd.
   2301 define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
   2302 ; SSE-LABEL: combine_undef_input_test14:
   2303 ; SSE:       # %bb.0:
   2304 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2305 ; SSE-NEXT:    retq
   2306 ;
   2307 ; AVX-LABEL: combine_undef_input_test14:
   2308 ; AVX:       # %bb.0:
   2309 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2310 ; AVX-NEXT:    retq
   2311   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2312   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   2313   ret <4 x float> %2
   2314 }
   2315 
        ; Swapped-operand variant of test5: folds to one movsd/blendps
        ; selecting <a[0], a[1], b[2], b[3]>.
   2316 define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
   2317 ; SSE2-LABEL: combine_undef_input_test15:
   2318 ; SSE2:       # %bb.0:
   2319 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
   2320 ; SSE2-NEXT:    movapd %xmm1, %xmm0
   2321 ; SSE2-NEXT:    retq
   2322 ;
   2323 ; SSSE3-LABEL: combine_undef_input_test15:
   2324 ; SSSE3:       # %bb.0:
   2325 ; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
   2326 ; SSSE3-NEXT:    movapd %xmm1, %xmm0
   2327 ; SSSE3-NEXT:    retq
   2328 ;
   2329 ; SSE41-LABEL: combine_undef_input_test15:
   2330 ; SSE41:       # %bb.0:
   2331 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
   2332 ; SSE41-NEXT:    retq
   2333 ;
   2334 ; AVX-LABEL: combine_undef_input_test15:
   2335 ; AVX:       # %bb.0:
   2336 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
   2337 ; AVX-NEXT:    retq
   2338   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   2339   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
   2340   ret <4 x float> %2
   2341 }
   2342 
   2343 
   2344 ; Verify that shuffles are canonicalized according to rules:
   2345 ;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
   2346 ;
   2347 ; This allows us to trigger the following combine rule:
   2348 ;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
   2349 ;
   2350 ; As a result, all the shuffle pairs in each function below should be
   2351 ; combined into a single legal shuffle operation.
   2352 
        ; Canonicalized shuffle(A, shuffle(A, undef)) pair composes to the
        ; identity: no code emitted.
   2353 define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
   2354 ; ALL-LABEL: combine_undef_input_test16:
   2355 ; ALL:       # %bb.0:
   2356 ; ALL-NEXT:    retq
   2357   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
   2358   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
   2359   ret <4 x float> %2
   2360 }
   2361 
        ; Canonicalized pair composes to the low-double splat <a[0],a[1],a[0],a[1]>.
   2362 define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
   2363 ; SSE2-LABEL: combine_undef_input_test17:
   2364 ; SSE2:       # %bb.0:
   2365 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
   2366 ; SSE2-NEXT:    retq
   2367 ;
   2368 ; SSSE3-LABEL: combine_undef_input_test17:
   2369 ; SSSE3:       # %bb.0:
   2370 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2371 ; SSSE3-NEXT:    retq
   2372 ;
   2373 ; SSE41-LABEL: combine_undef_input_test17:
   2374 ; SSE41:       # %bb.0:
   2375 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2376 ; SSE41-NEXT:    retq
   2377 ;
   2378 ; AVX-LABEL: combine_undef_input_test17:
   2379 ; AVX:       # %bb.0:
   2380 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
   2381 ; AVX-NEXT:    retq
   2382   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
   2383   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
   2384   ret <4 x float> %2
   2385 }
   2386 
        ; Same low-double splat as test17, via different masks.
   2387 define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
   2388 ; SSE2-LABEL: combine_undef_input_test18:
   2389 ; SSE2:       # %bb.0:
   2390 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
   2391 ; SSE2-NEXT:    retq
   2392 ;
   2393 ; SSSE3-LABEL: combine_undef_input_test18:
   2394 ; SSSE3:       # %bb.0:
   2395 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2396 ; SSSE3-NEXT:    retq
   2397 ;
   2398 ; SSE41-LABEL: combine_undef_input_test18:
   2399 ; SSE41:       # %bb.0:
   2400 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2401 ; SSE41-NEXT:    retq
   2402 ;
   2403 ; AVX-LABEL: combine_undef_input_test18:
   2404 ; AVX:       # %bb.0:
   2405 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
   2406 ; AVX-NEXT:    retq
   2407   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   2408   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
   2409   ret <4 x float> %2
   2410 }
   2411 
        ; Canonicalized pair composes to the high-double splat <a[2],a[3],a[2],a[3]>.
   2412 define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
   2413 ; SSE-LABEL: combine_undef_input_test19:
   2414 ; SSE:       # %bb.0:
   2415 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
   2416 ; SSE-NEXT:    retq
   2417 ;
   2418 ; AVX-LABEL: combine_undef_input_test19:
   2419 ; AVX:       # %bb.0:
   2420 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
   2421 ; AVX-NEXT:    retq
   2422   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2423   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   2424   ret <4 x float> %2
   2425 }
   2426 
        ; Canonicalized pair composes to the identity: no code emitted.
   2427 define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
   2428 ; ALL-LABEL: combine_undef_input_test20:
   2429 ; ALL:       # %bb.0:
   2430 ; ALL-NEXT:    retq
   2431   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   2432   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
   2433   ret <4 x float> %2
   2434 }
   2435 
   2436 ; These tests are designed to test the ability to combine away unnecessary
   2437 ; operations feeding into a shuffle. The AVX cases are the important ones as
   2438 ; they leverage operations which cannot be done naturally on the entire vector
   2439 ; and thus are decomposed into multiple smaller operations.
   2440 
        ; The shuffle mask <7,6,5,4,7,6,5,4> only reads the high 128-bit half of
        ; the add result; SSE therefore adds only %xmm1. The AVX1/AVX2 checks
        ; still show the full-width add followed by a permute (the unneeded
        ; low-half work is not eliminated there).
   2441 define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
   2442 ; SSE-LABEL: combine_unneeded_subvector1:
   2443 ; SSE:       # %bb.0:
   2444 ; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
   2445 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
   2446 ; SSE-NEXT:    movdqa %xmm0, %xmm1
   2447 ; SSE-NEXT:    retq
   2448 ;
   2449 ; AVX1-LABEL: combine_unneeded_subvector1:
   2450 ; AVX1:       # %bb.0:
   2451 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2452 ; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
   2453 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
   2454 ; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
   2455 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
   2456 ; AVX1-NEXT:    retq
   2457 ;
   2458 ; AVX2-SLOW-LABEL: combine_unneeded_subvector1:
   2459 ; AVX2-SLOW:       # %bb.0:
   2460 ; AVX2-SLOW-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
   2461 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
   2462 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
   2463 ; AVX2-SLOW-NEXT:    retq
   2464 ;
   2465 ; AVX2-FAST-LABEL: combine_unneeded_subvector1:
   2466 ; AVX2-FAST:       # %bb.0:
   2467 ; AVX2-FAST-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
   2468 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
   2469 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
   2470 ; AVX2-FAST-NEXT:    retq
   2471   %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   2472   %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
   2473   ret <8 x i32> %c
   2474 }
   2475 
        ; Two-input variant: the mask reads only the high halves of %b and of
        ; the add result, so SSE adds only %xmm1 and shuffles xmm3/xmm1.
   2476 define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
   2477 ; SSE-LABEL: combine_unneeded_subvector2:
   2478 ; SSE:       # %bb.0:
   2479 ; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
   2480 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
   2481 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
   2482 ; SSE-NEXT:    retq
   2483 ;
   2484 ; AVX1-LABEL: combine_unneeded_subvector2:
   2485 ; AVX1:       # %bb.0:
   2486 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2487 ; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
   2488 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
   2489 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
   2490 ; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
   2491 ; AVX1-NEXT:    retq
   2492 ;
   2493 ; AVX2-LABEL: combine_unneeded_subvector2:
   2494 ; AVX2:       # %bb.0:
   2495 ; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
   2496 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
   2497 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
   2498 ; AVX2-NEXT:    retq
   2499   %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   2500   %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
   2501   ret <8 x i32> %d
   2502 }
   2503 
        ; The shuffle pair selects <b[2], a[1], a[2], a[3]>: on SSE4.1+ this is
        ; a single insertps of b[2] into lane 0.
   2504 define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
   2505 ; SSE2-LABEL: combine_insertps1:
   2506 ; SSE2:       # %bb.0:
   2507 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
   2508 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
   2509 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   2510 ; SSE2-NEXT:    retq
   2511 ;
   2512 ; SSSE3-LABEL: combine_insertps1:
   2513 ; SSSE3:       # %bb.0:
   2514 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
   2515 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
   2516 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   2517 ; SSSE3-NEXT:    retq
   2518 ;
   2519 ; SSE41-LABEL: combine_insertps1:
   2520 ; SSE41:       # %bb.0:
   2521 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
   2522 ; SSE41-NEXT:    retq
   2523 ;
   2524 ; AVX-LABEL: combine_insertps1:
   2525 ; AVX:       # %bb.0:
   2526 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
   2527 ; AVX-NEXT:    retq
   2528 
   2529   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
   2530   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
   2531   ret <4 x float> %d
   2532 }
   2533 
        ; The shuffle pair selects <a[0], b[2], a[2], a[3]>: a single insertps
        ; of b[2] into lane 1 on SSE4.1+.
   2534 define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
   2535 ; SSE2-LABEL: combine_insertps2:
   2536 ; SSE2:       # %bb.0:
   2537 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
   2538 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
   2539 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   2540 ; SSE2-NEXT:    retq
   2541 ;
   2542 ; SSSE3-LABEL: combine_insertps2:
   2543 ; SSSE3:       # %bb.0:
   2544 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
   2545 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
   2546 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   2547 ; SSSE3-NEXT:    retq
   2548 ;
   2549 ; SSE41-LABEL: combine_insertps2:
   2550 ; SSE41:       # %bb.0:
   2551 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
   2552 ; SSE41-NEXT:    retq
   2553 ;
   2554 ; AVX-LABEL: combine_insertps2:
   2555 ; AVX:       # %bb.0:
   2556 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
   2557 ; AVX-NEXT:    retq
   2558 
   2559   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
   2560   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
   2561   ret <4 x float> %d
   2562 }
   2563 
        ; The shuffle pair selects <a[0], a[1], b[0], a[3]>: a single insertps
        ; of b[0] into lane 2 on SSE4.1+.
   2564 define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
   2565 ; SSE2-LABEL: combine_insertps3:
   2566 ; SSE2:       # %bb.0:
   2567 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
   2568 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
   2569 ; SSE2-NEXT:    retq
   2570 ;
   2571 ; SSSE3-LABEL: combine_insertps3:
   2572 ; SSSE3:       # %bb.0:
   2573 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
   2574 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
   2575 ; SSSE3-NEXT:    retq
   2576 ;
   2577 ; SSE41-LABEL: combine_insertps3:
   2578 ; SSE41:       # %bb.0:
   2579 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
   2580 ; SSE41-NEXT:    retq
   2581 ;
   2582 ; AVX-LABEL: combine_insertps3:
   2583 ; AVX:       # %bb.0:
   2584 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
   2585 ; AVX-NEXT:    retq
   2586 
   2587   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
   2588   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
   2589   ret <4 x float> %d
   2590 }
   2591 
        ; The shuffle pair selects <a[0], a[1], a[2], b[0]>: a single insertps
        ; of b[0] into lane 3 on SSE4.1+.
   2592 define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
   2593 ; SSE2-LABEL: combine_insertps4:
   2594 ; SSE2:       # %bb.0:
   2595 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
   2596 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
   2597 ; SSE2-NEXT:    retq
   2598 ;
   2599 ; SSSE3-LABEL: combine_insertps4:
   2600 ; SSSE3:       # %bb.0:
   2601 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
   2602 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
   2603 ; SSSE3-NEXT:    retq
   2604 ;
   2605 ; SSE41-LABEL: combine_insertps4:
   2606 ; SSE41:       # %bb.0:
   2607 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
   2608 ; SSE41-NEXT:    retq
   2609 ;
   2610 ; AVX-LABEL: combine_insertps4:
   2611 ; AVX:       # %bb.0:
   2612 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
   2613 ; AVX-NEXT:    retq
   2614 
   2615   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
   2616   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
   2617   ret <4 x float> %d
   2618 }
   2619 
        ; A scalar f64 load, insert, and blend-with-zero must collapse to a
        ; single movsd load (low 8 bytes, upper lanes zeroed) plus the store.
   2620 define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) {
   2621 ; SSE-LABEL: combine_scalar_load_with_blend_with_zero:
   2622 ; SSE:       # %bb.0:
   2623 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
   2624 ; SSE-NEXT:    movaps %xmm0, (%rsi)
   2625 ; SSE-NEXT:    retq
   2626 ;
   2627 ; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
   2628 ; AVX:       # %bb.0:
   2629 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
   2630 ; AVX-NEXT:    vmovaps %xmm0, (%rsi)
   2631 ; AVX-NEXT:    retq
   2632   %1 = load double, double* %a0, align 8
   2633   %2 = insertelement <2 x double> undef, double %1, i32 0
   2634   %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1
   2635   %4 = bitcast <2 x double> %3 to <4 x float>
   2636   %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
   2637   store <4 x float> %5, <4 x float>* %a1, align 16
   2638   ret void
   2639 }
   2640 
   2641 ; PR30371
        ; Insertion of a scalar into lane 0 of an otherwise-constant vector
        ; should become a movss/blendps against a constant-pool load.
   2642 define <4 x float> @combine_constant_insertion_v4f32(float %f) {
   2643 ; SSE2-LABEL: combine_constant_insertion_v4f32:
   2644 ; SSE2:       # %bb.0:
   2645 ; SSE2-NEXT:    movaps {{.*#+}} xmm1 = <u,4,5,3>
   2646 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   2647 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   2648 ; SSE2-NEXT:    retq
   2649 ;
   2650 ; SSSE3-LABEL: combine_constant_insertion_v4f32:
   2651 ; SSSE3:       # %bb.0:
   2652 ; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = <u,4,5,3>
   2653 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   2654 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   2655 ; SSSE3-NEXT:    retq
   2656 ;
   2657 ; SSE41-LABEL: combine_constant_insertion_v4f32:
   2658 ; SSE41:       # %bb.0:
   2659 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
   2660 ; SSE41-NEXT:    retq
   2661 ;
   2662 ; AVX-LABEL: combine_constant_insertion_v4f32:
   2663 ; AVX:       # %bb.0:
   2664 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
   2665 ; AVX-NEXT:    retq
   2666   %a0 = insertelement <4 x float> undef, float %f, i32 0
   2667   %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   2668   ret <4 x float> %ret
   2669 }
   2670 
        ; Integer variant of the previous test: scalar insertion into lane 0 of
        ; a constant vector becomes movss (pre-SSE4.1) or pinsrd $0.
   2671 define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
   2672 ; SSE2-LABEL: combine_constant_insertion_v4i32:
   2673 ; SSE2:       # %bb.0:
   2674 ; SSE2-NEXT:    movd %edi, %xmm1
   2675 ; SSE2-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
   2676 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
   2677 ; SSE2-NEXT:    retq
   2678 ;
   2679 ; SSSE3-LABEL: combine_constant_insertion_v4i32:
   2680 ; SSSE3:       # %bb.0:
   2681 ; SSSE3-NEXT:    movd %edi, %xmm1
   2682 ; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
   2683 ; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
   2684 ; SSSE3-NEXT:    retq
   2685 ;
   2686 ; SSE41-LABEL: combine_constant_insertion_v4i32:
   2687 ; SSE41:       # %bb.0:
   2688 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = <u,4,5,30>
   2689 ; SSE41-NEXT:    pinsrd $0, %edi, %xmm0
   2690 ; SSE41-NEXT:    retq
   2691 ;
   2692 ; AVX-LABEL: combine_constant_insertion_v4i32:
   2693 ; AVX:       # %bb.0:
   2694 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,4,5,30>
   2695 ; AVX-NEXT:    vpinsrd $0, %edi, %xmm0, %xmm0
   2696 ; AVX-NEXT:    retq
   2697   %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
   2698   %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   2699   ret <4 x i32> %ret
   2700 }
   2701 
        ; Regression test for PR22377: odd-lane and even-lane shuffles of %a
        ; feed an fadd; the shuffles must lower to simple permutes and the
        ; final interleave to a single unpcklps.
   2702 define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
   2703 ; SSE-LABEL: PR22377:
   2704 ; SSE:       # %bb.0: # %entry
   2705 ; SSE-NEXT:    movaps %xmm0, %xmm1
   2706 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
   2707 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
   2708 ; SSE-NEXT:    addps %xmm0, %xmm1
   2709 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   2710 ; SSE-NEXT:    retq
   2711 ;
   2712 ; AVX-LABEL: PR22377:
   2713 ; AVX:       # %bb.0: # %entry
   2714 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
   2715 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
   2716 ; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
   2717 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   2718 ; AVX-NEXT:    retq
   2719 entry:
   2720   %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
   2721   %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
   2722   %r2 = fadd <4 x float> %s1, %s2
   2723   %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   2724   ret <4 x float> %s3
   2725 }
   2726 
        ; Regression test for PR22390: a rotation of %a (<3,0,1,2>) is blended
        ; with b[0] in lane 0 and added back to the rotated vector; the second
        ; shuffle must fold to a movss/blendps rather than another full shuffle.
   2727 define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
   2728 ; SSE2-LABEL: PR22390:
   2729 ; SSE2:       # %bb.0: # %entry
   2730 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
   2731 ; SSE2-NEXT:    movaps %xmm0, %xmm2
   2732 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
   2733 ; SSE2-NEXT:    addps %xmm0, %xmm2
   2734 ; SSE2-NEXT:    movaps %xmm2, %xmm0
   2735 ; SSE2-NEXT:    retq
   2736 ;
   2737 ; SSSE3-LABEL: PR22390:
   2738 ; SSSE3:       # %bb.0: # %entry
   2739 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
   2740 ; SSSE3-NEXT:    movaps %xmm0, %xmm2
   2741 ; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
   2742 ; SSSE3-NEXT:    addps %xmm0, %xmm2
   2743 ; SSSE3-NEXT:    movaps %xmm2, %xmm0
   2744 ; SSSE3-NEXT:    retq
   2745 ;
   2746 ; SSE41-LABEL: PR22390:
   2747 ; SSE41:       # %bb.0: # %entry
   2748 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
   2749 ; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
   2750 ; SSE41-NEXT:    addps %xmm1, %xmm0
   2751 ; SSE41-NEXT:    retq
   2752 ;
   2753 ; AVX-LABEL: PR22390:
   2754 ; AVX:       # %bb.0: # %entry
   2755 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
   2756 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
   2757 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   2758 ; AVX-NEXT:    retq
   2759 entry:
   2760   %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
   2761   %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   2762   %r2 = fadd <4 x float> %s1, %s2
   2763   ret <4 x float> %r2
   2764 }
   2765 
        ; Regression test for PR22412: a 256-bit blend of a[0,1] with b[2..7]
        ; followed by the cross-lane permutation <1,0,7,6,5,4,3,2>. AVX2 with
        ; fast-variable-shuffle collapses the permutation into one vpermps.
   2766 define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
   2767 ; SSE2-LABEL: PR22412:
   2768 ; SSE2:       # %bb.0: # %entry
   2769 ; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
   2770 ; SSE2-NEXT:    movapd %xmm2, %xmm0
   2771 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
   2772 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
   2773 ; SSE2-NEXT:    movaps %xmm3, %xmm1
   2774 ; SSE2-NEXT:    retq
   2775 ;
   2776 ; SSSE3-LABEL: PR22412:
   2777 ; SSSE3:       # %bb.0: # %entry
   2778 ; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
   2779 ; SSSE3-NEXT:    movapd %xmm2, %xmm0
   2780 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
   2781 ; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
   2782 ; SSSE3-NEXT:    movaps %xmm3, %xmm1
   2783 ; SSSE3-NEXT:    retq
   2784 ;
   2785 ; SSE41-LABEL: PR22412:
   2786 ; SSE41:       # %bb.0: # %entry
   2787 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
   2788 ; SSE41-NEXT:    movaps %xmm0, %xmm1
   2789 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2]
   2790 ; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2]
   2791 ; SSE41-NEXT:    movaps %xmm1, %xmm0
   2792 ; SSE41-NEXT:    movaps %xmm3, %xmm1
   2793 ; SSE41-NEXT:    retq
   2794 ;
   2795 ; AVX1-LABEL: PR22412:
   2796 ; AVX1:       # %bb.0: # %entry
   2797 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
   2798 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
   2799 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
   2800 ; AVX1-NEXT:    retq
   2801 ;
   2802 ; AVX2-SLOW-LABEL: PR22412:
   2803 ; AVX2-SLOW:       # %bb.0: # %entry
   2804 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
   2805 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
   2806 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
   2807 ; AVX2-SLOW-NEXT:    retq
   2808 ;
   2809 ; AVX2-FAST-LABEL: PR22412:
   2810 ; AVX2-FAST:       # %bb.0: # %entry
   2811 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
   2812 ; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
   2813 ; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
   2814 ; AVX2-FAST-NEXT:    retq
   2815 entry:
   2816   %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   2817   %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
   2818   ret <8 x float> %s2
   2820 
   2821 define <4 x float> @PR30264(<4 x float> %x) {
; Regression test for llvm.org/PR30264: two chained shuffles whose second
; operands are constant vectors. %shuf1 keeps x[0] and pulls 0.0 from the
; first constant (index 5); %shuf2 keeps those two lanes (indices 0,1) and
; pulls <4.0, 1.0> from the second constant (indices 6,7). The net result is
; <x[0], 0.0, 4.0, 1.0>, so on SSE4.1/AVX the whole chain should fold to a
; single insertps (insert x[0], zero lane 1) over the constant-pool vector
; <u,u,4,1>, as the CHECK lines verify.
   2822 ; SSE2-LABEL: PR30264:
   2823 ; SSE2:       # %bb.0:
   2824 ; SSE2-NEXT:    xorps %xmm1, %xmm1
   2825 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
   2826 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
   2827 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   2828 ; SSE2-NEXT:    retq
   2829 ;
   2830 ; SSSE3-LABEL: PR30264:
   2831 ; SSSE3:       # %bb.0:
   2832 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
   2833 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
   2834 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
   2835 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   2836 ; SSSE3-NEXT:    retq
   2837 ;
   2838 ; SSE41-LABEL: PR30264:
   2839 ; SSE41:       # %bb.0:
   2840 ; SSE41-NEXT:    movaps {{.*#+}} xmm1 = <u,u,4,1>
   2841 ; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3]
   2842 ; SSE41-NEXT:    movaps %xmm1, %xmm0
   2843 ; SSE41-NEXT:    retq
   2844 ;
   2845 ; AVX-LABEL: PR30264:
   2846 ; AVX:       # %bb.0:
   2847 ; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <u,u,4,1>
   2848 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3]
   2849 ; AVX-NEXT:    retq
   2850   %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
   2851   %shuf2 = shufflevector <4 x float> %shuf1, <4 x float> <float undef, float undef, float 4.0, float 1.0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
   2852   ret <4 x float> %shuf2
   2853 }
   2854