; Home | History | Annotate | Download | only in X86 (code-viewer header; kept as a comment so the file stays a valid .ll test)
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
      3 ; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
      4 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
      5 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
      6 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
      7 ;
      8 ; Verify that the DAG combiner correctly folds bitwise operations across
      9 ; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
     10 ; basic and always-safe patterns. Also test that the DAG combiner will combine
     11 ; target-specific shuffle instructions where reasonable.
     12 
     13 target triple = "x86_64-unknown-unknown"
     14 
     15 declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
     16 declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
     17 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
     18 
     19 define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
; Mask 27 (0b00011011) reverses all four dword lanes; applying it twice is the
; identity, so both shuffles must be eliminated (bare retq).
     20 ; ALL-LABEL: combine_pshufd1:
     21 ; ALL:       # BB#0: # %entry
     22 ; ALL-NEXT:    retq
     23 entry:
     24   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
     25   %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
     26   ret <4 x i32> %c
     27 }
     28 
     29 define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
; pshuflw imm -28 (0xE4) is the identity low-word shuffle, so the two
; lane-reversing pshufds (imm 27) cancel through the bitcasts — expect no code.
     30 ; ALL-LABEL: combine_pshufd2:
     31 ; ALL:       # BB#0: # %entry
     32 ; ALL-NEXT:    retq
     33 entry:
     34   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
     35   %b.cast = bitcast <4 x i32> %b to <8 x i16>
     36   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
     37   %c.cast = bitcast <8 x i16> %c to <4 x i32>
     38   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
     39   ret <4 x i32> %d
     40 }
     41 
     42 define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
; Same as combine_pshufd2, but with an identity pshufhw (imm -28 == 0xE4)
; between the two cancelling pshufds — the whole chain folds away.
     43 ; ALL-LABEL: combine_pshufd3:
     44 ; ALL:       # BB#0: # %entry
     45 ; ALL-NEXT:    retq
     46 entry:
     47   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
     48   %b.cast = bitcast <4 x i32> %b to <8 x i16>
     49   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
     50   %c.cast = bitcast <8 x i16> %c to <4 x i32>
     51   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
     52   ret <4 x i32> %d
     53 }
     54 
     55 define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
; pshufd imm -31 (0xE1) swaps dwords 0 and 1 and self-cancels around the
; high-word reversal (pshufhw imm 27), which must survive as a single pshufhw.
     56 ; SSE-LABEL: combine_pshufd4:
     57 ; SSE:       # BB#0: # %entry
     58 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
     59 ; SSE-NEXT:    retq
     60 ;
     61 ; AVX-LABEL: combine_pshufd4:
     62 ; AVX:       # BB#0: # %entry
     63 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
     64 ; AVX-NEXT:    retq
     65 entry:
     66   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
     67   %b.cast = bitcast <4 x i32> %b to <8 x i16>
     68   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
     69   %c.cast = bitcast <8 x i16> %c to <4 x i32>
     70   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
     71   ret <4 x i32> %d
     72 }
     73 
     74 define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
; pshufd imm -76 (0xB4) swaps dwords 2 and 3 and self-cancels around the
; low-word reversal (pshuflw imm 27), which must survive as a single pshuflw.
     75 ; SSE-LABEL: combine_pshufd5:
     76 ; SSE:       # BB#0: # %entry
     77 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
     78 ; SSE-NEXT:    retq
     79 ;
     80 ; AVX-LABEL: combine_pshufd5:
     81 ; AVX:       # BB#0: # %entry
     82 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
     83 ; AVX-NEXT:    retq
     84 entry:
     85   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
     86   %b.cast = bitcast <4 x i32> %b to <8 x i16>
     87   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
     88   %c.cast = bitcast <8 x i16> %c to <4 x i32>
     89   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
     90   ret <4 x i32> %d
     91 }
     92 
     93 define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
; A splat of lane 0 (imm 0) followed by any pshufd (imm 8) still reads only
; lane 0, so the pair must fold to one splat pshufd [0,0,0,0].
     94 ; SSE-LABEL: combine_pshufd6:
     95 ; SSE:       # BB#0: # %entry
     96 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
     97 ; SSE-NEXT:    retq
     98 ;
     99 ; AVX-LABEL: combine_pshufd6:
    100 ; AVX:       # BB#0: # %entry
    101 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
    102 ; AVX-NEXT:    retq
    103 entry:
    104   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
    105   %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
    106   ret <4 x i32> %c
    107 }
    108 
    109 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
; The low-word reversal (imm 27) applied twice is the identity — expect retq.
    110 ; ALL-LABEL: combine_pshuflw1:
    111 ; ALL:       # BB#0: # %entry
    112 ; ALL-NEXT:    retq
    113 entry:
    114   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
    115   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
    116   ret <8 x i16> %c
    117 }
    118 
    119 define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
; pshufhw imm -28 (0xE4) is an identity shuffle, so the two low-word
; reversals around it cancel — expect no code.
    120 ; ALL-LABEL: combine_pshuflw2:
    121 ; ALL:       # BB#0: # %entry
    122 ; ALL-NEXT:    retq
    123 entry:
    124   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
    125   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
    126   %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
    127   ret <8 x i16> %d
    128 }
    129 
    130 define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
; The two low-word reversals cancel; only the independent high-word reversal
; (pshufhw imm 27) should remain.
    131 ; SSE-LABEL: combine_pshuflw3:
    132 ; SSE:       # BB#0: # %entry
    133 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
    134 ; SSE-NEXT:    retq
    135 ;
    136 ; AVX-LABEL: combine_pshuflw3:
    137 ; AVX:       # BB#0: # %entry
    138 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
    139 ; AVX-NEXT:    retq
    140 entry:
    141   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
    142   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
    143   %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
    144   ret <8 x i16> %d
    145 }
    146 
    147 define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
; Mirror of combine_pshuflw3: the two high-word reversals cancel and only
; the low-word reversal (pshuflw imm 27) should remain.
    148 ; SSE-LABEL: combine_pshufhw1:
    149 ; SSE:       # BB#0: # %entry
    150 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
    151 ; SSE-NEXT:    retq
    152 ;
    153 ; AVX-LABEL: combine_pshufhw1:
    154 ; AVX:       # BB#0: # %entry
    155 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
    156 ; AVX-NEXT:    retq
    157 entry:
    158   %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
    159   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
    160   %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
    161   ret <8 x i16> %d
    162 }
    163 
    164 define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; Both AND operands use the same single-source swizzle <0,2,1,3> (never
; reading %c), so the AND is done first followed by one pshufd.
    165 ; SSE-LABEL: combine_bitwise_ops_test1:
    166 ; SSE:       # BB#0:
    167 ; SSE-NEXT:    pand %xmm1, %xmm0
    168 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    169 ; SSE-NEXT:    retq
    170 ;
    171 ; AVX-LABEL: combine_bitwise_ops_test1:
    172 ; AVX:       # BB#0:
    173 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
    174 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    175 ; AVX-NEXT:    retq
    176   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    177   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    178   %and = and <4 x i32> %shuf1, %shuf2
    179   ret <4 x i32> %and
    180 }
    181 
    182 define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; OR variant of test1: identical swizzles on both operands fold to OR + one
; pshufd.
    183 ; SSE-LABEL: combine_bitwise_ops_test2:
    184 ; SSE:       # BB#0:
    185 ; SSE-NEXT:    por %xmm1, %xmm0
    186 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    187 ; SSE-NEXT:    retq
    188 ;
    189 ; AVX-LABEL: combine_bitwise_ops_test2:
    190 ; AVX:       # BB#0:
    191 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
    192 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    193 ; AVX-NEXT:    retq
    194   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    195   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    196   %or = or <4 x i32> %shuf1, %shuf2
    197   ret <4 x i32> %or
    198 }
    199 
    200 define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; XOR variant of test1: identical swizzles on both operands fold to XOR + one
; pshufd.
    201 ; SSE-LABEL: combine_bitwise_ops_test3:
    202 ; SSE:       # BB#0:
    203 ; SSE-NEXT:    pxor %xmm1, %xmm0
    204 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    205 ; SSE-NEXT:    retq
    206 ;
    207 ; AVX-LABEL: combine_bitwise_ops_test3:
    208 ; AVX:       # BB#0:
    209 ; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    210 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    211 ; AVX-NEXT:    retq
    212   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    213   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    214   %xor = xor <4 x i32> %shuf1, %shuf2
    215   ret <4 x i32> %xor
    216 }
    217 
    218 define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; Same as test1 but with %c as the first shuffle operand (indices 4-7 select
; %a / %b from the second operand); still folds to AND + one pshufd.
    219 ; SSE-LABEL: combine_bitwise_ops_test4:
    220 ; SSE:       # BB#0:
    221 ; SSE-NEXT:    pand %xmm1, %xmm0
    222 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    223 ; SSE-NEXT:    retq
    224 ;
    225 ; AVX-LABEL: combine_bitwise_ops_test4:
    226 ; AVX:       # BB#0:
    227 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
    228 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    229 ; AVX-NEXT:    retq
    230   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    231   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    232   %and = and <4 x i32> %shuf1, %shuf2
    233   ret <4 x i32> %and
    234 }
    235 
    236 define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; OR variant of test4 (%c as the unused first shuffle operand): folds to OR +
; one pshufd.
    237 ; SSE-LABEL: combine_bitwise_ops_test5:
    238 ; SSE:       # BB#0:
    239 ; SSE-NEXT:    por %xmm1, %xmm0
    240 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    241 ; SSE-NEXT:    retq
    242 ;
    243 ; AVX-LABEL: combine_bitwise_ops_test5:
    244 ; AVX:       # BB#0:
    245 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
    246 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    247 ; AVX-NEXT:    retq
    248   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    249   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    250   %or = or <4 x i32> %shuf1, %shuf2
    251   ret <4 x i32> %or
    252 }
    253 
    254 define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; XOR variant of test4 (%c as the unused first shuffle operand): folds to XOR
; + one pshufd.
    255 ; SSE-LABEL: combine_bitwise_ops_test6:
    256 ; SSE:       # BB#0:
    257 ; SSE-NEXT:    pxor %xmm1, %xmm0
    258 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    259 ; SSE-NEXT:    retq
    260 ;
    261 ; AVX-LABEL: combine_bitwise_ops_test6:
    262 ; AVX:       # BB#0:
    263 ; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    264 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    265 ; AVX-NEXT:    retq
    266   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    267   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    268   %xor = xor <4 x i32> %shuf1, %shuf2
    269   ret <4 x i32> %xor
    270 }
    271 
    272 
    273 ; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
    274 ; are not performing swizzle operations.
    276 define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; Both shuffles blend %a / %b with the SAME lanes of %c (<0,5,2,7>), so the
; AND is hoisted before the shuffle and a single blend with %c remains.
    277 ; SSE2-LABEL: combine_bitwise_ops_test1b:
    278 ; SSE2:       # BB#0:
    279 ; SSE2-NEXT:    pand %xmm1, %xmm0
    280 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    281 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    282 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    283 ; SSE2-NEXT:    retq
    284 ;
    285 ; SSSE3-LABEL: combine_bitwise_ops_test1b:
    286 ; SSSE3:       # BB#0:
    287 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    288 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    289 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    290 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    291 ; SSSE3-NEXT:    retq
    292 ;
    293 ; SSE41-LABEL: combine_bitwise_ops_test1b:
    294 ; SSE41:       # BB#0:
    295 ; SSE41-NEXT:    pand %xmm1, %xmm0
    296 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    297 ; SSE41-NEXT:    retq
    298 ;
    299 ; AVX1-LABEL: combine_bitwise_ops_test1b:
    300 ; AVX1:       # BB#0:
    301 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
    302 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    303 ; AVX1-NEXT:    retq
    304 ;
    305 ; AVX2-LABEL: combine_bitwise_ops_test1b:
    306 ; AVX2:       # BB#0:
    307 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
    308 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
    309 ; AVX2-NEXT:    retq
    310   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    311   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    312   %and = and <4 x i32> %shuf1, %shuf2
    313   ret <4 x i32> %and
    314 }
    315 
    316 define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; OR variant of test1b: identical blends with %c, so OR is hoisted and a
; single blend remains.
    317 ; SSE2-LABEL: combine_bitwise_ops_test2b:
    318 ; SSE2:       # BB#0:
    319 ; SSE2-NEXT:    por %xmm1, %xmm0
    320 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    321 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    322 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    323 ; SSE2-NEXT:    retq
    324 ;
    325 ; SSSE3-LABEL: combine_bitwise_ops_test2b:
    326 ; SSSE3:       # BB#0:
    327 ; SSSE3-NEXT:    por %xmm1, %xmm0
    328 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    329 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    330 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    331 ; SSSE3-NEXT:    retq
    332 ;
    333 ; SSE41-LABEL: combine_bitwise_ops_test2b:
    334 ; SSE41:       # BB#0:
    335 ; SSE41-NEXT:    por %xmm1, %xmm0
    336 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    337 ; SSE41-NEXT:    retq
    338 ;
    339 ; AVX1-LABEL: combine_bitwise_ops_test2b:
    340 ; AVX1:       # BB#0:
    341 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
    342 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    343 ; AVX1-NEXT:    retq
    344 ;
    345 ; AVX2-LABEL: combine_bitwise_ops_test2b:
    346 ; AVX2:       # BB#0:
    347 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
    348 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
    349 ; AVX2-NEXT:    retq
    350   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    351   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    352   %or = or <4 x i32> %shuf1, %shuf2
    353   ret <4 x i32> %or
    354 }
    355 
    356 define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; XOR variant of test1b: the %c lanes cancel (c ^ c == 0), so the result is
; xor(a,b) blended with zero (a constant AND mask before SSE4.1).
    357 ; SSE2-LABEL: combine_bitwise_ops_test3b:
    358 ; SSE2:       # BB#0:
    359 ; SSE2-NEXT:    xorps %xmm1, %xmm0
    360 ; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
    361 ; SSE2-NEXT:    retq
    362 ;
    363 ; SSSE3-LABEL: combine_bitwise_ops_test3b:
    364 ; SSSE3:       # BB#0:
    365 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
    366 ; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
    367 ; SSSE3-NEXT:    retq
    368 ;
    369 ; SSE41-LABEL: combine_bitwise_ops_test3b:
    370 ; SSE41:       # BB#0:
    371 ; SSE41-NEXT:    pxor %xmm1, %xmm0
    372 ; SSE41-NEXT:    pxor %xmm1, %xmm1
    373 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
    374 ; SSE41-NEXT:    retq
    375 ;
    376 ; AVX1-LABEL: combine_bitwise_ops_test3b:
    377 ; AVX1:       # BB#0:
    378 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    379 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    380 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
    381 ; AVX1-NEXT:    retq
    382 ;
    383 ; AVX2-LABEL: combine_bitwise_ops_test3b:
    384 ; AVX2:       # BB#0:
    385 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    386 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    387 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
    388 ; AVX2-NEXT:    retq
    389   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    390   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    391   %xor = xor <4 x i32> %shuf1, %shuf2
    392   ret <4 x i32> %xor
    393 }
    394 
    395 define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; Operand-swapped test1b (%c first, so <0,5,2,7> keeps %c in lanes 0,2):
; AND is hoisted and one blend remains, with the blend direction flipped.
    396 ; SSE2-LABEL: combine_bitwise_ops_test4b:
    397 ; SSE2:       # BB#0:
    398 ; SSE2-NEXT:    pand %xmm1, %xmm0
    399 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    400 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    401 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    402 ; SSE2-NEXT:    retq
    403 ;
    404 ; SSSE3-LABEL: combine_bitwise_ops_test4b:
    405 ; SSSE3:       # BB#0:
    406 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    407 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    408 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    409 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    410 ; SSSE3-NEXT:    retq
    411 ;
    412 ; SSE41-LABEL: combine_bitwise_ops_test4b:
    413 ; SSE41:       # BB#0:
    414 ; SSE41-NEXT:    pand %xmm1, %xmm0
    415 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    416 ; SSE41-NEXT:    retq
    417 ;
    418 ; AVX1-LABEL: combine_bitwise_ops_test4b:
    419 ; AVX1:       # BB#0:
    420 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
    421 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    422 ; AVX1-NEXT:    retq
    423 ;
    424 ; AVX2-LABEL: combine_bitwise_ops_test4b:
    425 ; AVX2:       # BB#0:
    426 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
    427 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
    428 ; AVX2-NEXT:    retq
    429   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    430   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    431   %and = and <4 x i32> %shuf1, %shuf2
    432   ret <4 x i32> %and
    433 }
    434 
    435 define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; OR variant of test4b: OR hoisted through the operand-swapped blends; one
; blend with %c remains.
    436 ; SSE2-LABEL: combine_bitwise_ops_test5b:
    437 ; SSE2:       # BB#0:
    438 ; SSE2-NEXT:    por %xmm1, %xmm0
    439 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    440 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    441 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    442 ; SSE2-NEXT:    retq
    443 ;
    444 ; SSSE3-LABEL: combine_bitwise_ops_test5b:
    445 ; SSSE3:       # BB#0:
    446 ; SSSE3-NEXT:    por %xmm1, %xmm0
    447 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    448 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    449 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    450 ; SSSE3-NEXT:    retq
    451 ;
    452 ; SSE41-LABEL: combine_bitwise_ops_test5b:
    453 ; SSE41:       # BB#0:
    454 ; SSE41-NEXT:    por %xmm1, %xmm0
    455 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    456 ; SSE41-NEXT:    retq
    457 ;
    458 ; AVX1-LABEL: combine_bitwise_ops_test5b:
    459 ; AVX1:       # BB#0:
    460 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
    461 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    462 ; AVX1-NEXT:    retq
    463 ;
    464 ; AVX2-LABEL: combine_bitwise_ops_test5b:
    465 ; AVX2:       # BB#0:
    466 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
    467 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
    468 ; AVX2-NEXT:    retq
    469   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    470   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    471   %or = or <4 x i32> %shuf1, %shuf2
    472   ret <4 x i32> %or
    473 }
    474 
    475 define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; XOR variant of test4b: the %c lanes (0,2) cancel to zero, so the result is
; xor(a,b) blended against zero (constant AND mask before SSE4.1).
    476 ; SSE2-LABEL: combine_bitwise_ops_test6b:
    477 ; SSE2:       # BB#0:
    478 ; SSE2-NEXT:    xorps %xmm1, %xmm0
    479 ; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
    480 ; SSE2-NEXT:    retq
    481 ;
    482 ; SSSE3-LABEL: combine_bitwise_ops_test6b:
    483 ; SSSE3:       # BB#0:
    484 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
    485 ; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
    486 ; SSSE3-NEXT:    retq
    487 ;
    488 ; SSE41-LABEL: combine_bitwise_ops_test6b:
    489 ; SSE41:       # BB#0:
    490 ; SSE41-NEXT:    pxor %xmm1, %xmm0
    491 ; SSE41-NEXT:    pxor %xmm1, %xmm1
    492 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
    493 ; SSE41-NEXT:    retq
    494 ;
    495 ; AVX1-LABEL: combine_bitwise_ops_test6b:
    496 ; AVX1:       # BB#0:
    497 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    498 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    499 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
    500 ; AVX1-NEXT:    retq
    501 ;
    502 ; AVX2-LABEL: combine_bitwise_ops_test6b:
    503 ; AVX2:       # BB#0:
    504 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    505 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    506 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
    507 ; AVX2-NEXT:    retq
    508   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    509   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    510   %xor = xor <4 x i32> %shuf1, %shuf2
    511   ret <4 x i32> %xor
    512 }
    513 
    514 define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; Mask <0,2,5,7> packs lanes 0,2 of %a / %b with lanes 1,3 of %c; the AND is
; still hoisted, leaving a blend plus one lane-reordering pshufd.
    515 ; SSE2-LABEL: combine_bitwise_ops_test1c:
    516 ; SSE2:       # BB#0:
    517 ; SSE2-NEXT:    pand %xmm1, %xmm0
    518 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    519 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    520 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    521 ; SSE2-NEXT:    retq
    522 ;
    523 ; SSSE3-LABEL: combine_bitwise_ops_test1c:
    524 ; SSSE3:       # BB#0:
    525 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    526 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    527 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    528 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    529 ; SSSE3-NEXT:    retq
    530 ;
    531 ; SSE41-LABEL: combine_bitwise_ops_test1c:
    532 ; SSE41:       # BB#0:
    533 ; SSE41-NEXT:    pand %xmm1, %xmm0
    534 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    535 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    536 ; SSE41-NEXT:    retq
    537 ;
    538 ; AVX1-LABEL: combine_bitwise_ops_test1c:
    539 ; AVX1:       # BB#0:
    540 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
    541 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    542 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    543 ; AVX1-NEXT:    retq
    544 ;
    545 ; AVX2-LABEL: combine_bitwise_ops_test1c:
    546 ; AVX2:       # BB#0:
    547 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
    548 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
    549 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    550 ; AVX2-NEXT:    retq
    551   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    552   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    553   %and = and <4 x i32> %shuf1, %shuf2
    554   ret <4 x i32> %and
    555 }
    556 
    557 define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; OR variant of test1c: OR hoisted through the <0,2,5,7> shuffles; blend +
; one pshufd remain.
    558 ; SSE2-LABEL: combine_bitwise_ops_test2c:
    559 ; SSE2:       # BB#0:
    560 ; SSE2-NEXT:    por %xmm1, %xmm0
    561 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    562 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    563 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    564 ; SSE2-NEXT:    retq
    565 ;
    566 ; SSSE3-LABEL: combine_bitwise_ops_test2c:
    567 ; SSSE3:       # BB#0:
    568 ; SSSE3-NEXT:    por %xmm1, %xmm0
    569 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    570 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    571 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    572 ; SSSE3-NEXT:    retq
    573 ;
    574 ; SSE41-LABEL: combine_bitwise_ops_test2c:
    575 ; SSE41:       # BB#0:
    576 ; SSE41-NEXT:    por %xmm1, %xmm0
    577 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    578 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    579 ; SSE41-NEXT:    retq
    580 ;
    581 ; AVX1-LABEL: combine_bitwise_ops_test2c:
    582 ; AVX1:       # BB#0:
    583 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
    584 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    585 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    586 ; AVX1-NEXT:    retq
    587 ;
    588 ; AVX2-LABEL: combine_bitwise_ops_test2c:
    589 ; AVX2:       # BB#0:
    590 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
    591 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
    592 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    593 ; AVX2-NEXT:    retq
    594   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    595   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    596   %or = or <4 x i32> %shuf1, %shuf2
    597   ret <4 x i32> %or
    598 }
    599 
    600 define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; XOR variant of test1c: the %c lanes (upper half of the result) cancel to
; zero, so only xor(a,b) lanes 0,2 survive with the high quadword zeroed.
    601 ; SSE2-LABEL: combine_bitwise_ops_test3c:
    602 ; SSE2:       # BB#0:
    603 ; SSE2-NEXT:    pxor %xmm1, %xmm0
    604 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
    605 ; SSE2-NEXT:    pxor %xmm1, %xmm1
    606 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
    607 ; SSE2-NEXT:    retq
    608 ;
    609 ; SSSE3-LABEL: combine_bitwise_ops_test3c:
    610 ; SSSE3:       # BB#0:
    611 ; SSSE3-NEXT:    pxor %xmm1, %xmm0
    612 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
    613 ; SSSE3-NEXT:    pxor %xmm1, %xmm1
    614 ; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
    615 ; SSSE3-NEXT:    retq
    616 ;
    617 ; SSE41-LABEL: combine_bitwise_ops_test3c:
    618 ; SSE41:       # BB#0:
    619 ; SSE41-NEXT:    pxor %xmm1, %xmm0
    620 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    621 ; SSE41-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
    622 ; SSE41-NEXT:    retq
    623 ;
    624 ; AVX-LABEL: combine_bitwise_ops_test3c:
    625 ; AVX:       # BB#0:
    626 ; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    627 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    628 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
    629 ; AVX-NEXT:    retq
    630   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    631   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    632   %xor = xor <4 x i32> %shuf1, %shuf2
    633   ret <4 x i32> %xor
    634 }
    635 
    636 define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; Operand-swapped test1c (%c first: its lanes 0,2 pass through while lanes
; 1,3 of %a / %b are ANDed); folds to AND + blend + pshufd.
    637 ; SSE2-LABEL: combine_bitwise_ops_test4c:
    638 ; SSE2:       # BB#0:
    639 ; SSE2-NEXT:    pand %xmm1, %xmm0
    640 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    641 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    642 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    643 ; SSE2-NEXT:    retq
    644 ;
    645 ; SSSE3-LABEL: combine_bitwise_ops_test4c:
    646 ; SSSE3:       # BB#0:
    647 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    648 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    649 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    650 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    651 ; SSSE3-NEXT:    retq
    652 ;
    653 ; SSE41-LABEL: combine_bitwise_ops_test4c:
    654 ; SSE41:       # BB#0:
    655 ; SSE41-NEXT:    pand %xmm1, %xmm0
    656 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    657 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    658 ; SSE41-NEXT:    retq
    659 ;
    660 ; AVX1-LABEL: combine_bitwise_ops_test4c:
    661 ; AVX1:       # BB#0:
    662 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
    663 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    664 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    665 ; AVX1-NEXT:    retq
    666 ;
    667 ; AVX2-LABEL: combine_bitwise_ops_test4c:
    668 ; AVX2:       # BB#0:
    669 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
    670 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
    671 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    672 ; AVX2-NEXT:    retq
    673   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    674   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    675   %and = and <4 x i32> %shuf1, %shuf2
    676   ret <4 x i32> %and
    677 }
    678 
    679 define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; OR variant of test4c: OR hoisted through the operand-swapped <0,2,5,7>
; shuffles; blend + one pshufd remain.
    680 ; SSE2-LABEL: combine_bitwise_ops_test5c:
    681 ; SSE2:       # BB#0:
    682 ; SSE2-NEXT:    por %xmm1, %xmm0
    683 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    684 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    685 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    686 ; SSE2-NEXT:    retq
    687 ;
    688 ; SSSE3-LABEL: combine_bitwise_ops_test5c:
    689 ; SSSE3:       # BB#0:
    690 ; SSSE3-NEXT:    por %xmm1, %xmm0
    691 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    692 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    693 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    694 ; SSSE3-NEXT:    retq
    695 ;
    696 ; SSE41-LABEL: combine_bitwise_ops_test5c:
    697 ; SSE41:       # BB#0:
    698 ; SSE41-NEXT:    por %xmm1, %xmm0
    699 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    700 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    701 ; SSE41-NEXT:    retq
    702 ;
    703 ; AVX1-LABEL: combine_bitwise_ops_test5c:
    704 ; AVX1:       # BB#0:
    705 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
    706 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    707 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    708 ; AVX1-NEXT:    retq
    709 ;
    710 ; AVX2-LABEL: combine_bitwise_ops_test5c:
    711 ; AVX2:       # BB#0:
    712 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
    713 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
    714 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    715 ; AVX2-NEXT:    retq
    716   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    717   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    718   %or = or <4 x i32> %shuf1, %shuf2
    719   ret <4 x i32> %or
    720 }
    721 
    722 define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
        ; Both shuffles take lanes 0,2 from %c, so those lanes XOR to zero; only (%a^%b) elements survive.
    723 ; SSE2-LABEL: combine_bitwise_ops_test6c:
    724 ; SSE2:       # BB#0:
    725 ; SSE2-NEXT:    pxor %xmm1, %xmm0
    726 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    727 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    728 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    729 ; SSE2-NEXT:    retq
    730 ;
    731 ; SSSE3-LABEL: combine_bitwise_ops_test6c:
    732 ; SSSE3:       # BB#0:
    733 ; SSSE3-NEXT:    pxor %xmm1, %xmm0
    734 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    735 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
    736 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    737 ; SSSE3-NEXT:    retq
    738 ;
    739 ; SSE41-LABEL: combine_bitwise_ops_test6c:
    740 ; SSE41:       # BB#0:
    741 ; SSE41-NEXT:    pxor %xmm1, %xmm0
    742 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
    743 ; SSE41-NEXT:    pxor %xmm0, %xmm0
    744 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
    745 ; SSE41-NEXT:    retq
    746 ;
    747 ; AVX1-LABEL: combine_bitwise_ops_test6c:
    748 ; AVX1:       # BB#0:
    749 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    750 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
    751 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    752 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
    753 ; AVX1-NEXT:    retq
    754 ;
    755 ; AVX2-LABEL: combine_bitwise_ops_test6c:
    756 ; AVX2:       # BB#0:
    757 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    758 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
    759 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    760 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
    761 ; AVX2-NEXT:    retq
    762   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    763   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    764   %xor = xor <4 x i32> %shuf1, %shuf2
    765   ret <4 x i32> %xor
    766 }
    767 
    768 define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
        ; %B only feeds a lane that is undef in the final mask, so the pair folds to one pshufd of %A.
    769 ; SSE-LABEL: combine_nested_undef_test1:
    770 ; SSE:       # BB#0:
    771 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
    772 ; SSE-NEXT:    retq
    773 ;
    774 ; AVX-LABEL: combine_nested_undef_test1:
    775 ; AVX:       # BB#0:
    776 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
    777 ; AVX-NEXT:    retq
    778   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
    779   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
    780   ret <4 x i32> %2
    781 }
    782 
    783 define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
        ; %B only feeds a lane that is undef in the final mask, so the pair folds to one pshufd of %A.
    784 ; SSE-LABEL: combine_nested_undef_test2:
    785 ; SSE:       # BB#0:
    786 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
    787 ; SSE-NEXT:    retq
    788 ;
    789 ; AVX-LABEL: combine_nested_undef_test2:
    790 ; AVX:       # BB#0:
    791 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
    792 ; AVX-NEXT:    retq
    793   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
    794   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
    795   ret <4 x i32> %2
    796 }
    797 
    798 define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
        ; %B only feeds a lane that is undef in the final mask, so the pair folds to one pshufd of %A.
    799 ; SSE-LABEL: combine_nested_undef_test3:
    800 ; SSE:       # BB#0:
    801 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
    802 ; SSE-NEXT:    retq
    803 ;
    804 ; AVX-LABEL: combine_nested_undef_test3:
    805 ; AVX:       # BB#0:
    806 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
    807 ; AVX-NEXT:    retq
    808   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
    809   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
    810   ret <4 x i32> %2
    811 }
    812 
    813 define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
        ; Only a0,a1 survive in the result; lowered as a 64-bit splat (vpbroadcastq on AVX2).
    814 ; SSE-LABEL: combine_nested_undef_test4:
    815 ; SSE:       # BB#0:
    816 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
    817 ; SSE-NEXT:    retq
    818 ;
    819 ; AVX1-LABEL: combine_nested_undef_test4:
    820 ; AVX1:       # BB#0:
    821 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
    822 ; AVX1-NEXT:    retq
    823 ;
    824 ; AVX2-LABEL: combine_nested_undef_test4:
    825 ; AVX2:       # BB#0:
    826 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
    827 ; AVX2-NEXT:    retq
    828   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
    829   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
    830   ret <4 x i32> %2
    831 }
    832 
    833 define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
        ; The outer mask reads only the %A lanes of the inner shuffle, so a single pshufd of %A remains.
    834 ; SSE-LABEL: combine_nested_undef_test5:
    835 ; SSE:       # BB#0:
    836 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
    837 ; SSE-NEXT:    retq
    838 ;
    839 ; AVX-LABEL: combine_nested_undef_test5:
    840 ; AVX:       # BB#0:
    841 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
    842 ; AVX-NEXT:    retq
    843   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
    844   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
    845   ret <4 x i32> %2
    846 }
    847 
    848 define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
        ; The outer mask leaves the %B lanes undef, folding the pair to one pshufd of %A.
    849 ; SSE-LABEL: combine_nested_undef_test6:
    850 ; SSE:       # BB#0:
    851 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
    852 ; SSE-NEXT:    retq
    853 ;
    854 ; AVX-LABEL: combine_nested_undef_test6:
    855 ; AVX:       # BB#0:
    856 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
    857 ; AVX-NEXT:    retq
    858   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
    859   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
    860   ret <4 x i32> %2
    861 }
    862 
    863 define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
        ; The outer mask reads only the %A lanes of the inner shuffle: a single pshufd.
    864 ; SSE-LABEL: combine_nested_undef_test7:
    865 ; SSE:       # BB#0:
    866 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
    867 ; SSE-NEXT:    retq
    868 ;
    869 ; AVX-LABEL: combine_nested_undef_test7:
    870 ; AVX:       # BB#0:
    871 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
    872 ; AVX-NEXT:    retq
    873   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    874   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
    875   ret <4 x i32> %2
    876 }
    877 
    878 define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
        ; The %B lanes of the inner shuffle end up undef, so the pair folds to one pshufd of %A.
    879 ; SSE-LABEL: combine_nested_undef_test8:
    880 ; SSE:       # BB#0:
    881 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
    882 ; SSE-NEXT:    retq
    883 ;
    884 ; AVX-LABEL: combine_nested_undef_test8:
    885 ; AVX:       # BB#0:
    886 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
    887 ; AVX-NEXT:    retq
    888   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
    889   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
    890   ret <4 x i32> %2
    891 }
    892 
    893 define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
        ; Folds to a single pshufd of %A; the lane coming from %B is undef in the result.
    894 ; SSE-LABEL: combine_nested_undef_test9:
    895 ; SSE:       # BB#0:
    896 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
    897 ; SSE-NEXT:    retq
    898 ;
    899 ; AVX-LABEL: combine_nested_undef_test9:
    900 ; AVX:       # BB#0:
    901 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
    902 ; AVX-NEXT:    retq
    903   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
    904   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
    905   ret <4 x i32> %2
    906 }
    907 
    908 define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
        ; Folds to a single pshufd of %A; all lanes reading %B are undef in the result.
    909 ; SSE-LABEL: combine_nested_undef_test10:
    910 ; SSE:       # BB#0:
    911 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
    912 ; SSE-NEXT:    retq
    913 ;
    914 ; AVX-LABEL: combine_nested_undef_test10:
    915 ; AVX:       # BB#0:
    916 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
    917 ; AVX-NEXT:    retq
    918   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
    919   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
    920   ret <4 x i32> %2
    921 }
    922 
    923 define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
        ; The outer mask reads only the %A lanes of the inner shuffle, so one pshufd of %A remains.
    924 ; SSE-LABEL: combine_nested_undef_test11:
    925 ; SSE:       # BB#0:
    926 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
    927 ; SSE-NEXT:    retq
    928 ;
    929 ; AVX-LABEL: combine_nested_undef_test11:
    930 ; AVX:       # BB#0:
    931 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
    932 ; AVX-NEXT:    retq
    933   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
    934   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
    935   ret <4 x i32> %2
    936 }
    937 
    938 define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
        ; Only a0 (plus undef lanes) survives; lowered as a 64-bit splat (vpbroadcastq on AVX2).
    939 ; SSE-LABEL: combine_nested_undef_test12:
    940 ; SSE:       # BB#0:
    941 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
    942 ; SSE-NEXT:    retq
    943 ;
    944 ; AVX1-LABEL: combine_nested_undef_test12:
    945 ; AVX1:       # BB#0:
    946 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
    947 ; AVX1-NEXT:    retq
    948 ;
    949 ; AVX2-LABEL: combine_nested_undef_test12:
    950 ; AVX2:       # BB#0:
    951 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
    952 ; AVX2-NEXT:    retq
    953   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
    954   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
    955   ret <4 x i32> %2
    956 }
    957 
    958 ; The following pair of shuffles is folded into vector %A.
    959 define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
        ; Result is <undef,a1,a2,undef>, i.e. %A up to undef lanes, so no instructions are emitted.
    960 ; ALL-LABEL: combine_nested_undef_test13:
    961 ; ALL:       # BB#0:
    962 ; ALL-NEXT:    retq
    963   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
    964   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
    965   ret <4 x i32> %2
    966 }
    967 
    968 ; The following pair of shuffles is folded into vector %B.
    969 define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
        ; Result is <b0,undef,b2,undef>, i.e. %B up to undef lanes, so only a register copy remains.
    970 ; SSE-LABEL: combine_nested_undef_test14:
    971 ; SSE:       # BB#0:
    972 ; SSE-NEXT:    movaps %xmm1, %xmm0
    973 ; SSE-NEXT:    retq
    974 ;
    975 ; AVX-LABEL: combine_nested_undef_test14:
    976 ; AVX:       # BB#0:
    977 ; AVX-NEXT:    vmovaps %xmm1, %xmm0
    978 ; AVX-NEXT:    retq
    979   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
    980   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
    981   ret <4 x i32> %2
    982 }
    983 
    984 
    985 ; Verify that we don't optimize the following cases. We expect more than one shuffle.
    986 ;
    987 ; FIXME: Many of these already don't make sense, and the rest should stop
    988 ; making sense with the new vector shuffle lowering. Revisit at least testing for
    989 ; it.
    990 
    991 define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
        ; Lanes from both %A and %B are live in the result, so two shuffles (or shuffle+blend) remain.
    992 ; SSE2-LABEL: combine_nested_undef_test15:
    993 ; SSE2:       # BB#0:
    994 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
    995 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
    996 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    997 ; SSE2-NEXT:    retq
    998 ;
    999 ; SSSE3-LABEL: combine_nested_undef_test15:
   1000 ; SSSE3:       # BB#0:
   1001 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
   1002 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
   1003 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1004 ; SSSE3-NEXT:    retq
   1005 ;
   1006 ; SSE41-LABEL: combine_nested_undef_test15:
   1007 ; SSE41:       # BB#0:
   1008 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
   1009 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
   1010 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
   1011 ; SSE41-NEXT:    retq
   1012 ;
   1013 ; AVX1-LABEL: combine_nested_undef_test15:
   1014 ; AVX1:       # BB#0:
   1015 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
   1016 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
   1017 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
   1018 ; AVX1-NEXT:    retq
   1019 ;
   1020 ; AVX2-LABEL: combine_nested_undef_test15:
   1021 ; AVX2:       # BB#0:
   1022 ; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
   1023 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
   1024 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
   1025 ; AVX2-NEXT:    retq
   1026   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
   1027   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
   1028   ret <4 x i32> %2
   1029 }
   1030 
   1031 define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
        ; Lanes from both %A and %B are live in the result, so two shuffles (or shuffle+blend) remain.
   1032 ; SSE2-LABEL: combine_nested_undef_test16:
   1033 ; SSE2:       # BB#0:
   1034 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
   1035 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
   1036 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1037 ; SSE2-NEXT:    retq
   1038 ;
   1039 ; SSSE3-LABEL: combine_nested_undef_test16:
   1040 ; SSSE3:       # BB#0:
   1041 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
   1042 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
   1043 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1044 ; SSSE3-NEXT:    retq
   1045 ;
   1046 ; SSE41-LABEL: combine_nested_undef_test16:
   1047 ; SSE41:       # BB#0:
   1048 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   1049 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
   1050 ; SSE41-NEXT:    retq
   1051 ;
   1052 ; AVX1-LABEL: combine_nested_undef_test16:
   1053 ; AVX1:       # BB#0:
   1054 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   1055 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
   1056 ; AVX1-NEXT:    retq
   1057 ;
   1058 ; AVX2-LABEL: combine_nested_undef_test16:
   1059 ; AVX2:       # BB#0:
   1060 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   1061 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
   1062 ; AVX2-NEXT:    retq
   1063   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1064   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
   1065   ret <4 x i32> %2
   1066 }
   1067 
   1068 define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
        ; Lanes from both %A and %B are live in the result, so two shuffles (or blend+shuffle) remain.
   1069 ; SSE2-LABEL: combine_nested_undef_test17:
   1070 ; SSE2:       # BB#0:
   1071 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
   1072 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
   1073 ; SSE2-NEXT:    retq
   1074 ;
   1075 ; SSSE3-LABEL: combine_nested_undef_test17:
   1076 ; SSSE3:       # BB#0:
   1077 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
   1078 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
   1079 ; SSSE3-NEXT:    retq
   1080 ;
   1081 ; SSE41-LABEL: combine_nested_undef_test17:
   1082 ; SSE41:       # BB#0:
   1083 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
   1084 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
   1085 ; SSE41-NEXT:    retq
   1086 ;
   1087 ; AVX1-LABEL: combine_nested_undef_test17:
   1088 ; AVX1:       # BB#0:
   1089 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
   1090 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
   1091 ; AVX1-NEXT:    retq
   1092 ;
   1093 ; AVX2-LABEL: combine_nested_undef_test17:
   1094 ; AVX2:       # BB#0:
   1095 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
   1096 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
   1097 ; AVX2-NEXT:    retq
   1098   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
   1099   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
   1100   ret <4 x i32> %2
   1101 }
   1102 
   1103 define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
        ; Only %B lanes are read, so this does fold to a single pshufd of %B.
   1104 ; SSE-LABEL: combine_nested_undef_test18:
   1105 ; SSE:       # BB#0:
   1106 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
   1107 ; SSE-NEXT:    retq
   1108 ;
   1109 ; AVX-LABEL: combine_nested_undef_test18:
   1110 ; AVX:       # BB#0:
   1111 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
   1112 ; AVX-NEXT:    retq
   1113   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
   1114   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   1115   ret <4 x i32> %2
   1116 }
   1117 
   1118 define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
        ; Lanes from both %A and %B are live in the result, so a blend/unpack plus a pshufd remain.
   1119 ; SSE2-LABEL: combine_nested_undef_test19:
   1120 ; SSE2:       # BB#0:
   1121 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1122 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
   1123 ; SSE2-NEXT:    retq
   1124 ;
   1125 ; SSSE3-LABEL: combine_nested_undef_test19:
   1126 ; SSSE3:       # BB#0:
   1127 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1128 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
   1129 ; SSSE3-NEXT:    retq
   1130 ;
   1131 ; SSE41-LABEL: combine_nested_undef_test19:
   1132 ; SSE41:       # BB#0:
   1133 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
   1134 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
   1135 ; SSE41-NEXT:    retq
   1136 ;
   1137 ; AVX1-LABEL: combine_nested_undef_test19:
   1138 ; AVX1:       # BB#0:
   1139 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
   1140 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
   1141 ; AVX1-NEXT:    retq
   1142 ;
   1143 ; AVX2-LABEL: combine_nested_undef_test19:
   1144 ; AVX2:       # BB#0:
   1145 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
   1146 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
   1147 ; AVX2-NEXT:    retq
   1148   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
   1149   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
   1150   ret <4 x i32> %2
   1151 }
   1152 
   1153 define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
        ; Lanes from both %A and %B are live in the result, so two shuffles (or blend+shuffle) remain.
   1154 ; SSE2-LABEL: combine_nested_undef_test20:
   1155 ; SSE2:       # BB#0:
   1156 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
   1157 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
   1158 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1159 ; SSE2-NEXT:    retq
   1160 ;
   1161 ; SSSE3-LABEL: combine_nested_undef_test20:
   1162 ; SSSE3:       # BB#0:
   1163 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
   1164 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
   1165 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1166 ; SSSE3-NEXT:    retq
   1167 ;
   1168 ; SSE41-LABEL: combine_nested_undef_test20:
   1169 ; SSE41:       # BB#0:
   1170 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
   1171 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
   1172 ; SSE41-NEXT:    retq
   1173 ;
   1174 ; AVX1-LABEL: combine_nested_undef_test20:
   1175 ; AVX1:       # BB#0:
   1176 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
   1177 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
   1178 ; AVX1-NEXT:    retq
   1179 ;
   1180 ; AVX2-LABEL: combine_nested_undef_test20:
   1181 ; AVX2:       # BB#0:
   1182 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
   1183 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
   1184 ; AVX2-NEXT:    retq
   1185   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
   1186   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
   1187   ret <4 x i32> %2
   1188 }
   1189 
   1190 define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
        ; Lanes from both %A and %B are live in the result, so a blend/unpack plus a shuffle remain.
   1191 ; SSE2-LABEL: combine_nested_undef_test21:
   1192 ; SSE2:       # BB#0:
   1193 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
   1194 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
   1195 ; SSE2-NEXT:    retq
   1196 ;
   1197 ; SSSE3-LABEL: combine_nested_undef_test21:
   1198 ; SSSE3:       # BB#0:
   1199 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
   1200 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
   1201 ; SSSE3-NEXT:    retq
   1202 ;
   1203 ; SSE41-LABEL: combine_nested_undef_test21:
   1204 ; SSE41:       # BB#0:
   1205 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   1206 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1207 ; SSE41-NEXT:    retq
   1208 ;
   1209 ; AVX1-LABEL: combine_nested_undef_test21:
   1210 ; AVX1:       # BB#0:
   1211 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   1212 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1213 ; AVX1-NEXT:    retq
   1214 ;
   1215 ; AVX2-LABEL: combine_nested_undef_test21:
   1216 ; AVX2:       # BB#0:
   1217 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1218 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
   1219 ; AVX2-NEXT:    retq
   1220   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
   1221   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
   1222   ret <4 x i32> %2
   1223 }
   1224 
   1225 
   1226 ; Test that we correctly combine shuffles according to rule
   1227 ;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
   1228 
   1229 define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
        ; Only %B lanes of the inner shuffle are read, so the pair becomes shuffle(%B, undef).
   1230 ; SSE-LABEL: combine_nested_undef_test22:
   1231 ; SSE:       # BB#0:
   1232 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
   1233 ; SSE-NEXT:    retq
   1234 ;
   1235 ; AVX-LABEL: combine_nested_undef_test22:
   1236 ; AVX:       # BB#0:
   1237 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
   1238 ; AVX-NEXT:    retq
   1239   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
   1240   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
   1241   ret <4 x i32> %2
   1242 }
   1243 
   1244 define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
        ; Only %B lanes of the inner shuffle are read, so the pair becomes shuffle(%B, undef).
   1245 ; SSE-LABEL: combine_nested_undef_test23:
   1246 ; SSE:       # BB#0:
   1247 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
   1248 ; SSE-NEXT:    retq
   1249 ;
   1250 ; AVX-LABEL: combine_nested_undef_test23:
   1251 ; AVX:       # BB#0:
   1252 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
   1253 ; AVX-NEXT:    retq
   1254   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
   1255   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
   1256   ret <4 x i32> %2
   1257 }
   1258 
   1259 define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
        ; Only %B lanes of the inner shuffle are read, so the pair becomes shuffle(%B, undef).
   1260 ; SSE-LABEL: combine_nested_undef_test24:
   1261 ; SSE:       # BB#0:
   1262 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
   1263 ; SSE-NEXT:    retq
   1264 ;
   1265 ; AVX-LABEL: combine_nested_undef_test24:
   1266 ; AVX:       # BB#0:
   1267 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
   1268 ; AVX-NEXT:    retq
   1269   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
   1270   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
   1271   ret <4 x i32> %2
   1272 }
   1273 
   1274 define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
        ; Inner operands swapped: only %A lanes are read -> shuffle(%A, undef) (a qword splat on AVX2).
   1275 ; SSE-LABEL: combine_nested_undef_test25:
   1276 ; SSE:       # BB#0:
   1277 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1278 ; SSE-NEXT:    retq
   1279 ;
   1280 ; AVX1-LABEL: combine_nested_undef_test25:
   1281 ; AVX1:       # BB#0:
   1282 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1283 ; AVX1-NEXT:    retq
   1284 ;
   1285 ; AVX2-LABEL: combine_nested_undef_test25:
   1286 ; AVX2:       # BB#0:
   1287 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
   1288 ; AVX2-NEXT:    retq
   1289   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
   1290   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
   1291   ret <4 x i32> %2
   1292 }
   1293 
   1294 define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
        ; Inner operands swapped: only %A lanes are read -> shuffle(%A, undef).
   1295 ; SSE-LABEL: combine_nested_undef_test26:
   1296 ; SSE:       # BB#0:
   1297 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
   1298 ; SSE-NEXT:    retq
   1299 ;
   1300 ; AVX-LABEL: combine_nested_undef_test26:
   1301 ; AVX:       # BB#0:
   1302 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
   1303 ; AVX-NEXT:    retq
   1304   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
   1305   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
   1306   ret <4 x i32> %2
   1307 }
   1308 
   1309 define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
        ; Inner operands swapped: only %A lanes are read -> shuffle(%A, undef) (a qword splat on AVX2).
   1310 ; SSE-LABEL: combine_nested_undef_test27:
   1311 ; SSE:       # BB#0:
   1312 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1313 ; SSE-NEXT:    retq
   1314 ;
   1315 ; AVX1-LABEL: combine_nested_undef_test27:
   1316 ; AVX1:       # BB#0:
   1317 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1318 ; AVX1-NEXT:    retq
   1319 ;
   1320 ; AVX2-LABEL: combine_nested_undef_test27:
   1321 ; AVX2:       # BB#0:
   1322 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
   1323 ; AVX2-NEXT:    retq
   1324   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
   1325   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
   1326   ret <4 x i32> %2
   1327 }
   1328 
   1329 define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
        ; Inner operands swapped: only %A lanes are read -> shuffle(%A, undef).
   1330 ; SSE-LABEL: combine_nested_undef_test28:
   1331 ; SSE:       # BB#0:
   1332 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
   1333 ; SSE-NEXT:    retq
   1334 ;
   1335 ; AVX-LABEL: combine_nested_undef_test28:
   1336 ; AVX:       # BB#0:
   1337 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
   1338 ; AVX-NEXT:    retq
   1339   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
   1340   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
   1341   ret <4 x i32> %2
   1342 }
   1343 
   1344 define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
        ; The second shuffle re-selects every lane from %b, so the chain is just a copy of %b.
   1345 ; SSE-LABEL: combine_test1:
   1346 ; SSE:       # BB#0:
   1347 ; SSE-NEXT:    movaps %xmm1, %xmm0
   1348 ; SSE-NEXT:    retq
   1349 ;
   1350 ; AVX-LABEL: combine_test1:
   1351 ; AVX:       # BB#0:
   1352 ; AVX-NEXT:    vmovaps %xmm1, %xmm0
   1353 ; AVX-NEXT:    retq
   1354   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1355   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1356   ret <4 x float> %2
   1357 }
   1358 
   1359 define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
        ; Folds to a single blend: lane 0 from %a, lanes 1-3 from %b (movss before SSE4.1).
   1360 ; SSE2-LABEL: combine_test2:
   1361 ; SSE2:       # BB#0:
   1362 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1363 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1364 ; SSE2-NEXT:    retq
   1365 ;
   1366 ; SSSE3-LABEL: combine_test2:
   1367 ; SSSE3:       # BB#0:
   1368 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1369 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1370 ; SSSE3-NEXT:    retq
   1371 ;
   1372 ; SSE41-LABEL: combine_test2:
   1373 ; SSE41:       # BB#0:
   1374 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1375 ; SSE41-NEXT:    retq
   1376 ;
   1377 ; AVX-LABEL: combine_test2:
   1378 ; AVX:       # BB#0:
   1379 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1380 ; AVX-NEXT:    retq
   1381   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1382   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
   1383   ret <4 x float> %2
   1384 }
   1385 
   1386 define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
        ; The shuffle pair folds to unpcklpd: <a0,a1,b0,b1>.
   1387 ; SSE-LABEL: combine_test3:
   1388 ; SSE:       # BB#0:
   1389 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1390 ; SSE-NEXT:    retq
   1391 ;
   1392 ; AVX-LABEL: combine_test3:
   1393 ; AVX:       # BB#0:
   1394 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1395 ; AVX-NEXT:    retq
   1396   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   1397   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   1398   ret <4 x float> %2
   1399 }
   1400 
   1401 define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
        ; The shuffle pair folds to unpckhpd with operands swapped: <b2,b3,a2,a3>.
   1402 ; SSE-LABEL: combine_test4:
   1403 ; SSE:       # BB#0:
   1404 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   1405 ; SSE-NEXT:    movapd %xmm1, %xmm0
   1406 ; SSE-NEXT:    retq
   1407 ;
   1408 ; AVX-LABEL: combine_test4:
   1409 ; AVX:       # BB#0:
   1410 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   1411 ; AVX-NEXT:    retq
   1412   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   1413   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   1414   ret <4 x float> %2
   1415 }
   1416 
   1417 define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
   1418 ; SSE2-LABEL: combine_test5:
   1419 ; SSE2:       # BB#0:
   1420 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1421 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1422 ; SSE2-NEXT:    retq
   1423 ;
   1424 ; SSSE3-LABEL: combine_test5:
   1425 ; SSSE3:       # BB#0:
   1426 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1427 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1428 ; SSSE3-NEXT:    retq
   1429 ;
   1430 ; SSE41-LABEL: combine_test5:
   1431 ; SSE41:       # BB#0:
   1432 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1433 ; SSE41-NEXT:    retq
   1434 ;
   1435 ; AVX-LABEL: combine_test5:
   1436 ; AVX:       # BB#0:
   1437 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1438 ; AVX-NEXT:    retq
; The pair of shuffles composes to the blend <b0,a1,b2,b3>: a single blendps with
; SSE4.1/AVX, a two-shufps sequence on pre-SSE4.1 targets.
   1439   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1440   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   1441   ret <4 x float> %2
   1442 }
   1443 
   1444 define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
   1445 ; SSE-LABEL: combine_test6:
   1446 ; SSE:       # BB#0:
   1447 ; SSE-NEXT:    movaps %xmm1, %xmm0
   1448 ; SSE-NEXT:    retq
   1449 ;
   1450 ; AVX-LABEL: combine_test6:
   1451 ; AVX:       # BB#0:
   1452 ; AVX-NEXT:    vmovaps %xmm1, %xmm0
   1453 ; AVX-NEXT:    retq
; The pair of shuffles composes to the identity on %b, so only a register copy remains.
   1454   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1455   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1456   ret <4 x i32> %2
   1457 }
   1458 
   1459 define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
   1460 ; SSE2-LABEL: combine_test7:
   1461 ; SSE2:       # BB#0:
   1462 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1463 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1464 ; SSE2-NEXT:    retq
   1465 ;
   1466 ; SSSE3-LABEL: combine_test7:
   1467 ; SSSE3:       # BB#0:
   1468 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1469 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1470 ; SSSE3-NEXT:    retq
   1471 ;
   1472 ; SSE41-LABEL: combine_test7:
   1473 ; SSE41:       # BB#0:
   1474 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
   1475 ; SSE41-NEXT:    retq
   1476 ;
   1477 ; AVX1-LABEL: combine_test7:
   1478 ; AVX1:       # BB#0:
   1479 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
   1480 ; AVX1-NEXT:    retq
   1481 ;
   1482 ; AVX2-LABEL: combine_test7:
   1483 ; AVX2:       # BB#0:
   1484 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1485 ; AVX2-NEXT:    retq
; Integer version of combine_test2: the shuffles compose to <a0,b1,b2,b3>, a single
; element-0 blend (movss / pblendw / vpblendd depending on subtarget).
   1486   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1487   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
   1488   ret <4 x i32> %2
   1489 }
   1490 
   1491 define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
   1492 ; SSE-LABEL: combine_test8:
   1493 ; SSE:       # BB#0:
   1494 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1495 ; SSE-NEXT:    retq
   1496 ;
   1497 ; AVX-LABEL: combine_test8:
   1498 ; AVX:       # BB#0:
   1499 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1500 ; AVX-NEXT:    retq
; Integer version of combine_test3: composes to <a0,a1,b0,b1>, a single punpcklqdq.
   1501   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   1502   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   1503   ret <4 x i32> %2
   1504 }
   1505 
   1506 define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
   1507 ; SSE-LABEL: combine_test9:
   1508 ; SSE:       # BB#0:
   1509 ; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   1510 ; SSE-NEXT:    movdqa %xmm1, %xmm0
   1511 ; SSE-NEXT:    retq
   1512 ;
   1513 ; AVX-LABEL: combine_test9:
   1514 ; AVX:       # BB#0:
   1515 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   1516 ; AVX-NEXT:    retq
; Integer version of combine_test4: composes to <b2,b3,a2,a3>, a single punpckhqdq.
   1517   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   1518   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   1519   ret <4 x i32> %2
   1520 }
   1521 
   1522 define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
   1523 ; SSE2-LABEL: combine_test10:
   1524 ; SSE2:       # BB#0:
   1525 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1526 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1527 ; SSE2-NEXT:    retq
   1528 ;
   1529 ; SSSE3-LABEL: combine_test10:
   1530 ; SSSE3:       # BB#0:
   1531 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1532 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1533 ; SSSE3-NEXT:    retq
   1534 ;
   1535 ; SSE41-LABEL: combine_test10:
   1536 ; SSE41:       # BB#0:
   1537 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   1538 ; SSE41-NEXT:    retq
   1539 ;
   1540 ; AVX1-LABEL: combine_test10:
   1541 ; AVX1:       # BB#0:
   1542 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   1543 ; AVX1-NEXT:    retq
   1544 ;
   1545 ; AVX2-LABEL: combine_test10:
   1546 ; AVX2:       # BB#0:
   1547 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1548 ; AVX2-NEXT:    retq
; Integer version of combine_test5: composes to the blend <b0,a1,b2,b3>.
   1549   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1550   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   1551   ret <4 x i32> %2
   1552 }
   1553 
   1554 define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
   1555 ; ALL-LABEL: combine_test11:
   1556 ; ALL:       # BB#0:
   1557 ; ALL-NEXT:    retq
; Applying the same blend mask twice (second time against %a) reconstructs %a
; exactly, so the whole sequence folds away to no instructions.
   1558   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1559   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1560   ret <4 x float> %2
   1561 }
   1562 
   1563 define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
   1564 ; SSE2-LABEL: combine_test12:
   1565 ; SSE2:       # BB#0:
   1566 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1567 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1568 ; SSE2-NEXT:    retq
   1569 ;
   1570 ; SSSE3-LABEL: combine_test12:
   1571 ; SSSE3:       # BB#0:
   1572 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1573 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1574 ; SSSE3-NEXT:    retq
   1575 ;
   1576 ; SSE41-LABEL: combine_test12:
   1577 ; SSE41:       # BB#0:
   1578 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1579 ; SSE41-NEXT:    retq
   1580 ;
   1581 ; AVX-LABEL: combine_test12:
   1582 ; AVX:       # BB#0:
   1583 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1584 ; AVX-NEXT:    retq
; Second shuffle re-inserts a0 from %a; the net result <a0,b1,b2,b3> is a single
; element-0 blend (movss / blendps).
   1585   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   1586   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   1587   ret <4 x float> %2
   1588 }
   1589 
   1590 define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
   1591 ; SSE-LABEL: combine_test13:
   1592 ; SSE:       # BB#0:
   1593 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1594 ; SSE-NEXT:    retq
   1595 ;
   1596 ; AVX-LABEL: combine_test13:
   1597 ; AVX:       # BB#0:
   1598 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1599 ; AVX-NEXT:    retq
; Composes to <a0,a1,b0,b1> even with %a as the second operand of the second
; shuffle: still a single unpcklpd.
   1600   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   1601   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   1602   ret <4 x float> %2
   1603 }
   1604 
   1605 define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
   1606 ; SSE-LABEL: combine_test14:
   1607 ; SSE:       # BB#0:
   1608 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1609 ; SSE-NEXT:    retq
   1610 ;
   1611 ; AVX-LABEL: combine_test14:
   1612 ; AVX:       # BB#0:
   1613 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1614 ; AVX-NEXT:    retq
; Composes to <a2,a3,b2,b3>, a single unpckhpd of the two high halves.
   1615   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
   1616   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   1617   ret <4 x float> %2
   1618 }
   1619 
   1620 define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
   1621 ; SSE2-LABEL: combine_test15:
   1622 ; SSE2:       # BB#0:
   1623 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1624 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1625 ; SSE2-NEXT:    retq
   1626 ;
   1627 ; SSSE3-LABEL: combine_test15:
   1628 ; SSSE3:       # BB#0:
   1629 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1630 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1631 ; SSSE3-NEXT:    retq
   1632 ;
   1633 ; SSE41-LABEL: combine_test15:
   1634 ; SSE41:       # BB#0:
   1635 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1636 ; SSE41-NEXT:    retq
   1637 ;
   1638 ; AVX-LABEL: combine_test15:
   1639 ; AVX:       # BB#0:
   1640 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1641 ; AVX-NEXT:    retq
; Composes to the blend <b0,a1,b2,b3>, same target pattern as combine_test5.
   1642   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
   1643   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
   1644   ret <4 x float> %2
   1645 }
   1646 
   1647 define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
   1648 ; ALL-LABEL: combine_test16:
   1649 ; ALL:       # BB#0:
   1650 ; ALL-NEXT:    retq
; Integer version of combine_test11: the double blend reconstructs %a, so the
; whole sequence folds to nothing.
   1651   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1652   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1653   ret <4 x i32> %2
   1654 }
   1655 
   1656 define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
   1657 ; SSE2-LABEL: combine_test17:
   1658 ; SSE2:       # BB#0:
   1659 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1660 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1661 ; SSE2-NEXT:    retq
   1662 ;
   1663 ; SSSE3-LABEL: combine_test17:
   1664 ; SSSE3:       # BB#0:
   1665 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1666 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1667 ; SSSE3-NEXT:    retq
   1668 ;
   1669 ; SSE41-LABEL: combine_test17:
   1670 ; SSE41:       # BB#0:
   1671 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
   1672 ; SSE41-NEXT:    retq
   1673 ;
   1674 ; AVX1-LABEL: combine_test17:
   1675 ; AVX1:       # BB#0:
   1676 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
   1677 ; AVX1-NEXT:    retq
   1678 ;
   1679 ; AVX2-LABEL: combine_test17:
   1680 ; AVX2:       # BB#0:
   1681 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1682 ; AVX2-NEXT:    retq
; Integer version of combine_test12: composes to <a0,b1,b2,b3>, a single
; element-0 blend.
   1683   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   1684   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   1685   ret <4 x i32> %2
   1686 }
   1687 
   1688 define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
   1689 ; SSE-LABEL: combine_test18:
   1690 ; SSE:       # BB#0:
   1691 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1692 ; SSE-NEXT:    retq
   1693 ;
   1694 ; AVX-LABEL: combine_test18:
   1695 ; AVX:       # BB#0:
   1696 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1697 ; AVX-NEXT:    retq
; Integer version of combine_test13: composes to <a0,a1,b0,b1>, a single punpcklqdq.
   1698   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   1699   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   1700   ret <4 x i32> %2
   1701 }
   1702 
   1703 define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
   1704 ; SSE-LABEL: combine_test19:
   1705 ; SSE:       # BB#0:
   1706 ; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1707 ; SSE-NEXT:    retq
   1708 ;
   1709 ; AVX-LABEL: combine_test19:
   1710 ; AVX:       # BB#0:
   1711 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1712 ; AVX-NEXT:    retq
; Integer version of combine_test14: composes to <a2,a3,b2,b3>, a single punpckhqdq.
   1713   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
   1714   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   1715   ret <4 x i32> %2
   1716 }
   1717 
   1718 define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
   1719 ; SSE2-LABEL: combine_test20:
   1720 ; SSE2:       # BB#0:
   1721 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1722 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1723 ; SSE2-NEXT:    retq
   1724 ;
   1725 ; SSSE3-LABEL: combine_test20:
   1726 ; SSSE3:       # BB#0:
   1727 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1728 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1729 ; SSSE3-NEXT:    retq
   1730 ;
   1731 ; SSE41-LABEL: combine_test20:
   1732 ; SSE41:       # BB#0:
   1733 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   1734 ; SSE41-NEXT:    retq
   1735 ;
   1736 ; AVX1-LABEL: combine_test20:
   1737 ; AVX1:       # BB#0:
   1738 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   1739 ; AVX1-NEXT:    retq
   1740 ;
   1741 ; AVX2-LABEL: combine_test20:
   1742 ; AVX2:       # BB#0:
   1743 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1744 ; AVX2-NEXT:    retq
; Integer version of combine_test15: composes to the blend <b0,a1,b2,b3>.
   1745   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
   1746   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
   1747   ret <4 x i32> %2
   1748 }
   1749 
   1750 define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
   1751 ; SSE-LABEL: combine_test21:
   1752 ; SSE:       # BB#0:
   1753 ; SSE-NEXT:    movdqa %xmm0, %xmm2
   1754 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
   1755 ; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1756 ; SSE-NEXT:    movdqa %xmm2, (%rdi)
   1757 ; SSE-NEXT:    retq
   1758 ;
   1759 ; AVX1-LABEL: combine_test21:
   1760 ; AVX1:       # BB#0:
   1761 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1762 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
   1763 ; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1764 ; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
   1765 ; AVX1-NEXT:    vzeroupper
   1766 ; AVX1-NEXT:    retq
   1767 ;
   1768 ; AVX2-LABEL: combine_test21:
   1769 ; AVX2:       # BB#0:
   1770 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1771 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
   1772 ; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1773 ; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
   1774 ; AVX2-NEXT:    vzeroupper
   1775 ; AVX2-NEXT:    retq
; Width-reducing shuffles of an <8 x i32>: the two masks pick the even and odd
; 64-bit chunks, lowered as punpcklqdq/punpckhqdq of the vector's two halves; the
; first result is stored, the second returned.
   1776   %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   1777   %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1778   store <4 x i32> %1, <4 x i32>* %ptr, align 16
   1779   ret <4 x i32> %2
   1780 }
   1781 
   1782 define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
   1783 ; SSE-LABEL: combine_test22:
   1784 ; SSE:       # BB#0:
   1785 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
   1786 ; SSE-NEXT:    movhpd (%rsi), %xmm0
   1787 ; SSE-NEXT:    retq
   1788 ;
   1789 ; AVX-LABEL: combine_test22:
   1790 ; AVX:       # BB#0:
   1791 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
   1792 ; AVX-NEXT:    vmovhpd (%rsi), %xmm0, %xmm0
   1793 ; AVX-NEXT:    retq
   1794 ; Current AVX2 lowering of this is still awful, not adding a test case.
; Width-widening shuffle: two <2 x float> loads are concatenated into the low half
; of an <8 x float> (high half undef), lowered as a 64-bit load plus movhpd.
   1795   %1 = load <2 x float>, <2 x float>* %a, align 8
   1796   %2 = load <2 x float>, <2 x float>* %b, align 8
   1797   %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   1798   ret <8 x float> %3
   1800 
   1801 ; Check some negative cases.
   1802 ; FIXME: Do any of these really make sense? Are they redundant with the above tests?
   1803 
   1804 define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
   1805 ; SSE-LABEL: combine_test1b:
   1806 ; SSE:       # BB#0:
   1807 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
   1808 ; SSE-NEXT:    movaps %xmm1, %xmm0
   1809 ; SSE-NEXT:    retq
   1810 ;
   1811 ; AVX-LABEL: combine_test1b:
   1812 ; AVX:       # BB#0:
   1813 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
   1814 ; AVX-NEXT:    retq
; The repeated lane in the second mask makes the composed shuffle a single-input
; permute of %b: <b0,b1,b2,b0> via shufps/vpermilps.
   1815   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1816   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
   1817   ret <4 x float> %2
   1818 }
   1819 
   1820 define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
   1821 ; SSE2-LABEL: combine_test2b:
   1822 ; SSE2:       # BB#0:
   1823 ; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
   1824 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1825 ; SSE2-NEXT:    retq
   1826 ;
   1827 ; SSSE3-LABEL: combine_test2b:
   1828 ; SSSE3:       # BB#0:
   1829 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
   1830 ; SSSE3-NEXT:    retq
   1831 ;
   1832 ; SSE41-LABEL: combine_test2b:
   1833 ; SSE41:       # BB#0:
   1834 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
   1835 ; SSE41-NEXT:    retq
   1836 ;
   1837 ; AVX-LABEL: combine_test2b:
   1838 ; AVX:       # BB#0:
   1839 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
   1840 ; AVX-NEXT:    retq
; Composes to <b0,b1,b0,b1>, a low-64-bit splat of %b: movddup where SSE3+ is
; available, movlhps on plain SSE2.
   1841   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1842   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
   1843   ret <4 x float> %2
   1844 }
   1845 
   1846 define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
   1847 ; SSE2-LABEL: combine_test3b:
   1848 ; SSE2:       # BB#0:
   1849 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
   1850 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
   1851 ; SSE2-NEXT:    retq
   1852 ;
   1853 ; SSSE3-LABEL: combine_test3b:
   1854 ; SSSE3:       # BB#0:
   1855 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
   1856 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
   1857 ; SSSE3-NEXT:    retq
   1858 ;
   1859 ; SSE41-LABEL: combine_test3b:
   1860 ; SSE41:       # BB#0:
   1861 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
   1862 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
   1863 ; SSE41-NEXT:    retq
   1864 ;
   1865 ; AVX-LABEL: combine_test3b:
   1866 ; AVX:       # BB#0:
   1867 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
   1868 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
   1869 ; AVX-NEXT:    retq
; Negative case: the composed mask <a0,b3,b2,b3> mixes both inputs with a repeat,
; so it cannot collapse below two instructions on any tested subtarget.
   1870   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
   1871   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
   1872   ret <4 x float> %2
   1873 }
   1874 
   1875 define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
   1876 ; SSE-LABEL: combine_test4b:
   1877 ; SSE:       # BB#0:
   1878 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
   1879 ; SSE-NEXT:    movaps %xmm1, %xmm0
   1880 ; SSE-NEXT:    retq
   1881 ;
   1882 ; AVX-LABEL: combine_test4b:
   1883 ; AVX:       # BB#0:
   1884 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
   1885 ; AVX-NEXT:    retq
; Composes to the single-input permute <b1,b1,b2,b3> of %b: one shufps/vpermilps.
   1886   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1887   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
   1888   ret <4 x float> %2
   1889 }
   1890 
   1891 
   1892 ; Verify that we correctly fold shuffles even when we use illegal vector types.
   1893 
   1894 define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
   1895 ; SSE2-LABEL: combine_test1c:
   1896 ; SSE2:       # BB#0:
   1897 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1898 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1899 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   1900 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1901 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1902 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1903 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
   1904 ; SSE2-NEXT:    retq
   1905 ;
   1906 ; SSSE3-LABEL: combine_test1c:
   1907 ; SSSE3:       # BB#0:
   1908 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1909 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1910 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   1911 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1912 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1913 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1914 ; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
   1915 ; SSSE3-NEXT:    retq
   1916 ;
   1917 ; SSE41-LABEL: combine_test1c:
   1918 ; SSE41:       # BB#0:
   1919 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1920 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1921 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
   1922 ; SSE41-NEXT:    retq
   1923 ;
   1924 ; AVX1-LABEL: combine_test1c:
   1925 ; AVX1:       # BB#0:
   1926 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1927 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1928 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
   1929 ; AVX1-NEXT:    retq
   1930 ;
   1931 ; AVX2-LABEL: combine_test1c:
   1932 ; AVX2:       # BB#0:
   1933 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1934 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1935 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1936 ; AVX2-NEXT:    retq
; Illegal-type (<4 x i8>) version of combine_test1/test7: after the loads are
; widened, the shuffles still fold to a single element-0 blend.
   1937   %A = load <4 x i8>, <4 x i8>* %a
   1938   %B = load <4 x i8>, <4 x i8>* %b
   1939   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1940   %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
   1941   ret <4 x i8> %2
   1942 }
   1943 
   1944 define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
   1945 ; SSE2-LABEL: combine_test2c:
   1946 ; SSE2:       # BB#0:
   1947 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1948 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1949 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1950 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1951 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1952 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   1953 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1954 ; SSE2-NEXT:    retq
   1955 ;
   1956 ; SSSE3-LABEL: combine_test2c:
   1957 ; SSSE3:       # BB#0:
   1958 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1959 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1960 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1961 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1962 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1963 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   1964 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1965 ; SSSE3-NEXT:    retq
   1966 ;
   1967 ; SSE41-LABEL: combine_test2c:
   1968 ; SSE41:       # BB#0:
   1969 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1970 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1971 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1972 ; SSE41-NEXT:    retq
   1973 ;
   1974 ; AVX-LABEL: combine_test2c:
   1975 ; AVX:       # BB#0:
   1976 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1977 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1978 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1979 ; AVX-NEXT:    retq
; Illegal-type version of combine_test3/test8: after widening, the composed
; shuffle <A0,A1,B0,B1> folds to a single punpcklqdq.
   1980   %A = load <4 x i8>, <4 x i8>* %a
   1981   %B = load <4 x i8>, <4 x i8>* %b
   1982   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
   1983   %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   1984   ret <4 x i8> %2
   1985 }
   1986 
   1987 define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
   1988 ; SSE2-LABEL: combine_test3c:
   1989 ; SSE2:       # BB#0:
   1990 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1991 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1992 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   1993 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1994 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1995 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1996 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1997 ; SSE2-NEXT:    retq
   1998 ;
   1999 ; SSSE3-LABEL: combine_test3c:
   2000 ; SSSE3:       # BB#0:
   2001 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   2002 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2003 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   2004 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   2005 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2006 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   2007 ; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   2008 ; SSSE3-NEXT:    retq
   2009 ;
   2010 ; SSE41-LABEL: combine_test3c:
   2011 ; SSE41:       # BB#0:
   2012 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2013 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2014 ; SSE41-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   2015 ; SSE41-NEXT:    retq
   2016 ;
   2017 ; AVX-LABEL: combine_test3c:
   2018 ; AVX:       # BB#0:
   2019 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2020 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2021 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2022 ; AVX-NEXT:    retq
; Illegal-type version of combine_test4/test9: after widening, the composed
; shuffle <B2,B3,A2,A3> folds to a single punpckhqdq.
   2023   %A = load <4 x i8>, <4 x i8>* %a
   2024   %B = load <4 x i8>, <4 x i8>* %b
   2025   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2026   %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   2027   ret <4 x i8> %2
   2028 }
   2029 
   2030 define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
   2031 ; SSE2-LABEL: combine_test4c:
   2032 ; SSE2:       # BB#0:
   2033 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   2034 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2035 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   2036 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   2037 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2038 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   2039 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   2040 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   2041 ; SSE2-NEXT:    retq
   2042 ;
   2043 ; SSSE3-LABEL: combine_test4c:
   2044 ; SSSE3:       # BB#0:
   2045 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   2046 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2047 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   2048 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   2049 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2050 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   2051 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   2052 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   2053 ; SSSE3-NEXT:    retq
   2054 ;
   2055 ; SSE41-LABEL: combine_test4c:
   2056 ; SSE41:       # BB#0:
   2057 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2058 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2059 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
   2060 ; SSE41-NEXT:    retq
   2061 ;
   2062 ; AVX1-LABEL: combine_test4c:
   2063 ; AVX1:       # BB#0:
   2064 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2065 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2066 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   2067 ; AVX1-NEXT:    retq
   2068 ;
   2069 ; AVX2-LABEL: combine_test4c:
   2070 ; AVX2:       # BB#0:
   2071 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2072 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2073 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   2074 ; AVX2-NEXT:    retq
; Illegal-type version of combine_test5/test10: after widening, the composed
; shuffle <B0,A1,B2,B3> folds to a single blend (pblendw/vpblendd).
   2075   %A = load <4 x i8>, <4 x i8>* %a
   2076   %B = load <4 x i8>, <4 x i8>* %b
   2077   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   2078   %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   2079   ret <4 x i8> %2
   2080 }
   2081 
   2082 
   2083 ; The following test cases are generated from this C++ code
   2084 ;
   2085 ;__m128 blend_01(__m128 a, __m128 b)
   2086 ;{
   2087 ;  __m128 s = a;
   2088 ;  s = _mm_blend_ps( s, b, 1<<0 );
   2089 ;  s = _mm_blend_ps( s, b, 1<<1 );
   2090 ;  return s;
   2091 ;}
   2092 ;
   2093 ;__m128 blend_02(__m128 a, __m128 b)
   2094 ;{
   2095 ;  __m128 s = a;
   2096 ;  s = _mm_blend_ps( s, b, 1<<0 );
   2097 ;  s = _mm_blend_ps( s, b, 1<<2 );
   2098 ;  return s;
   2099 ;}
   2100 ;
   2101 ;__m128 blend_123(__m128 a, __m128 b)
   2102 ;{
   2103 ;  __m128 s = a;
   2104 ;  s = _mm_blend_ps( s, b, 1<<1 );
   2105 ;  s = _mm_blend_ps( s, b, 1<<2 );
   2106 ;  s = _mm_blend_ps( s, b, 1<<3 );
   2107 ;  return s;
   2108 ;}
   2109 
   2110 ; Ideally, we should collapse the following shuffles into a single one.
   2111 
   2112 define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
   2113 ; SSE2-LABEL: combine_blend_01:
   2114 ; SSE2:       # BB#0:
   2115 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2116 ; SSE2-NEXT:    retq
   2117 ;
   2118 ; SSSE3-LABEL: combine_blend_01:
   2119 ; SSSE3:       # BB#0:
   2120 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2121 ; SSSE3-NEXT:    retq
   2122 ;
   2123 ; SSE41-LABEL: combine_blend_01:
   2124 ; SSE41:       # BB#0:
   2125 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2126 ; SSE41-NEXT:    retq
   2127 ;
   2128 ; AVX-LABEL: combine_blend_01:
   2129 ; AVX:       # BB#0:
   2130 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2131 ; AVX-NEXT:    retq
   2132   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
   2133   %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
   2134   ret <4 x float> %shuffle6
   2135 }
   2136 
   2137 define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
   2138 ; SSE2-LABEL: combine_blend_02:
   2139 ; SSE2:       # BB#0:
   2140 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
   2141 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
   2142 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   2143 ; SSE2-NEXT:    retq
   2144 ;
   2145 ; SSSE3-LABEL: combine_blend_02:
   2146 ; SSSE3:       # BB#0:
   2147 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
   2148 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
   2149 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   2150 ; SSSE3-NEXT:    retq
   2151 ;
   2152 ; SSE41-LABEL: combine_blend_02:
   2153 ; SSE41:       # BB#0:
   2154 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
   2155 ; SSE41-NEXT:    retq
   2156 ;
   2157 ; AVX-LABEL: combine_blend_02:
   2158 ; AVX:       # BB#0:
   2159 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
   2160 ; AVX-NEXT:    retq
   2161   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
   2162   %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
   2163   ret <4 x float> %shuffle6
   2164 }
   2165 
   2166 define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
   2167 ; SSE2-LABEL: combine_blend_123:
   2168 ; SSE2:       # BB#0:
   2169 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   2170 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   2171 ; SSE2-NEXT:    retq
   2172 ;
   2173 ; SSSE3-LABEL: combine_blend_123:
   2174 ; SSSE3:       # BB#0:
   2175 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   2176 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   2177 ; SSSE3-NEXT:    retq
   2178 ;
   2179 ; SSE41-LABEL: combine_blend_123:
   2180 ; SSE41:       # BB#0:
   2181 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   2182 ; SSE41-NEXT:    retq
   2183 ;
   2184 ; AVX-LABEL: combine_blend_123:
   2185 ; AVX:       # BB#0:
   2186 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   2187 ; AVX-NEXT:    retq
   2188   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
   2189   %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
   2190   %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   2191   ret <4 x float> %shuffle12
   2192 }
   2193 
   2194 define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
   2195 ; SSE-LABEL: combine_test_movhl_1:
   2196 ; SSE:       # BB#0:
   2197 ; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   2198 ; SSE-NEXT:    movdqa %xmm1, %xmm0
   2199 ; SSE-NEXT:    retq
   2200 ;
   2201 ; AVX-LABEL: combine_test_movhl_1:
   2202 ; AVX:       # BB#0:
   2203 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2204 ; AVX-NEXT:    retq
   2205   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
   2206   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
   2207   ret <4 x i32> %2
   2208 }
   2209 
   2210 define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
   2211 ; SSE-LABEL: combine_test_movhl_2:
   2212 ; SSE:       # BB#0:
   2213 ; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   2214 ; SSE-NEXT:    movdqa %xmm1, %xmm0
   2215 ; SSE-NEXT:    retq
   2216 ;
   2217 ; AVX-LABEL: combine_test_movhl_2:
   2218 ; AVX:       # BB#0:
   2219 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2220 ; AVX-NEXT:    retq
   2221   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
   2222   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
   2223   ret <4 x i32> %2
   2224 }
   2225 
   2226 define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
   2227 ; SSE-LABEL: combine_test_movhl_3:
   2228 ; SSE:       # BB#0:
   2229 ; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   2230 ; SSE-NEXT:    movdqa %xmm1, %xmm0
   2231 ; SSE-NEXT:    retq
   2232 ;
   2233 ; AVX-LABEL: combine_test_movhl_3:
   2234 ; AVX:       # BB#0:
   2235 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2236 ; AVX-NEXT:    retq
   2237   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
   2238   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
   2239   ret <4 x i32> %2
   2240 }
   2241 
   2242 
   2243 ; Verify that we fold shuffles according to rule:
   2244 ;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
   2245 
   2246 define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
   2247 ; SSE2-LABEL: combine_undef_input_test1:
   2248 ; SSE2:       # BB#0:
   2249 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2250 ; SSE2-NEXT:    retq
   2251 ;
   2252 ; SSSE3-LABEL: combine_undef_input_test1:
   2253 ; SSSE3:       # BB#0:
   2254 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2255 ; SSSE3-NEXT:    retq
   2256 ;
   2257 ; SSE41-LABEL: combine_undef_input_test1:
   2258 ; SSE41:       # BB#0:
   2259 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2260 ; SSE41-NEXT:    retq
   2261 ;
   2262 ; AVX-LABEL: combine_undef_input_test1:
   2263 ; AVX:       # BB#0:
   2264 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2265 ; AVX-NEXT:    retq
   2266   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
   2267   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
   2268   ret <4 x float> %2
   2269 }
   2270 
   2271 define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
   2272 ; SSE-LABEL: combine_undef_input_test2:
   2273 ; SSE:       # BB#0:
   2274 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2275 ; SSE-NEXT:    retq
   2276 ;
   2277 ; AVX-LABEL: combine_undef_input_test2:
   2278 ; AVX:       # BB#0:
   2279 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2280 ; AVX-NEXT:    retq
   2281   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
   2282   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
   2283   ret <4 x float> %2
   2284 }
   2285 
   2286 define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
   2287 ; SSE-LABEL: combine_undef_input_test3:
   2288 ; SSE:       # BB#0:
   2289 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2290 ; SSE-NEXT:    retq
   2291 ;
   2292 ; AVX-LABEL: combine_undef_input_test3:
   2293 ; AVX:       # BB#0:
   2294 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2295 ; AVX-NEXT:    retq
   2296   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   2297   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   2298   ret <4 x float> %2
   2299 }
   2300 
   2301 define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
   2302 ; SSE-LABEL: combine_undef_input_test4:
   2303 ; SSE:       # BB#0:
   2304 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   2305 ; SSE-NEXT:    movapd %xmm1, %xmm0
   2306 ; SSE-NEXT:    retq
   2307 ;
   2308 ; AVX-LABEL: combine_undef_input_test4:
   2309 ; AVX:       # BB#0:
   2310 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2311 ; AVX-NEXT:    retq
   2312   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2313   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   2314   ret <4 x float> %2
   2315 }
   2316 
   2317 define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
   2318 ; SSE2-LABEL: combine_undef_input_test5:
   2319 ; SSE2:       # BB#0:
   2320 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
   2321 ; SSE2-NEXT:    movapd %xmm1, %xmm0
   2322 ; SSE2-NEXT:    retq
   2323 ;
   2324 ; SSSE3-LABEL: combine_undef_input_test5:
   2325 ; SSSE3:       # BB#0:
   2326 ; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
   2327 ; SSSE3-NEXT:    movapd %xmm1, %xmm0
   2328 ; SSSE3-NEXT:    retq
   2329 ;
   2330 ; SSE41-LABEL: combine_undef_input_test5:
   2331 ; SSE41:       # BB#0:
   2332 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
   2333 ; SSE41-NEXT:    retq
   2334 ;
   2335 ; AVX-LABEL: combine_undef_input_test5:
   2336 ; AVX:       # BB#0:
   2337 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
   2338 ; AVX-NEXT:    retq
   2339   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   2340   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
   2341   ret <4 x float> %2
   2342 }
   2343 
   2344 
   2345 ; Verify that we fold shuffles according to rule:
   2346 ;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
   2347 
   2348 define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
   2349 ; ALL-LABEL: combine_undef_input_test6:
   2350 ; ALL:       # BB#0:
   2351 ; ALL-NEXT:    retq
   2352   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
   2353   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
   2354   ret <4 x float> %2
   2355 }
   2356 
   2357 define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
   2358 ; SSE2-LABEL: combine_undef_input_test7:
   2359 ; SSE2:       # BB#0:
   2360 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
   2361 ; SSE2-NEXT:    retq
   2362 ;
   2363 ; SSSE3-LABEL: combine_undef_input_test7:
   2364 ; SSSE3:       # BB#0:
   2365 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2366 ; SSSE3-NEXT:    retq
   2367 ;
   2368 ; SSE41-LABEL: combine_undef_input_test7:
   2369 ; SSE41:       # BB#0:
   2370 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2371 ; SSE41-NEXT:    retq
   2372 ;
   2373 ; AVX-LABEL: combine_undef_input_test7:
   2374 ; AVX:       # BB#0:
   2375 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
   2376 ; AVX-NEXT:    retq
   2377   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
   2378   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
   2379   ret <4 x float> %2
   2380 }
   2381 
   2382 define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
   2383 ; SSE2-LABEL: combine_undef_input_test8:
   2384 ; SSE2:       # BB#0:
   2385 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
   2386 ; SSE2-NEXT:    retq
   2387 ;
   2388 ; SSSE3-LABEL: combine_undef_input_test8:
   2389 ; SSSE3:       # BB#0:
   2390 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2391 ; SSSE3-NEXT:    retq
   2392 ;
   2393 ; SSE41-LABEL: combine_undef_input_test8:
   2394 ; SSE41:       # BB#0:
   2395 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2396 ; SSE41-NEXT:    retq
   2397 ;
   2398 ; AVX-LABEL: combine_undef_input_test8:
   2399 ; AVX:       # BB#0:
   2400 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
   2401 ; AVX-NEXT:    retq
   2402   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   2403   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   2404   ret <4 x float> %2
   2405 }
   2406 
   2407 define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
   2408 ; SSE-LABEL: combine_undef_input_test9:
   2409 ; SSE:       # BB#0:
   2410 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   2411 ; SSE-NEXT:    retq
   2412 ;
   2413 ; AVX-LABEL: combine_undef_input_test9:
   2414 ; AVX:       # BB#0:
   2415 ; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
   2416 ; AVX-NEXT:    retq
   2417   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2418   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   2419   ret <4 x float> %2
   2420 }
   2421 
   2422 define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
   2423 ; ALL-LABEL: combine_undef_input_test10:
   2424 ; ALL:       # BB#0:
   2425 ; ALL-NEXT:    retq
   2426   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   2427   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
   2428   ret <4 x float> %2
   2429 }
   2430 
   2431 define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
   2432 ; SSE2-LABEL: combine_undef_input_test11:
   2433 ; SSE2:       # BB#0:
   2434 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2435 ; SSE2-NEXT:    retq
   2436 ;
   2437 ; SSSE3-LABEL: combine_undef_input_test11:
   2438 ; SSSE3:       # BB#0:
   2439 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2440 ; SSSE3-NEXT:    retq
   2441 ;
   2442 ; SSE41-LABEL: combine_undef_input_test11:
   2443 ; SSE41:       # BB#0:
   2444 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2445 ; SSE41-NEXT:    retq
   2446 ;
   2447 ; AVX-LABEL: combine_undef_input_test11:
   2448 ; AVX:       # BB#0:
   2449 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2450 ; AVX-NEXT:    retq
   2451   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
   2452   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
   2453   ret <4 x float> %2
   2454 }
   2455 
   2456 define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
   2457 ; SSE-LABEL: combine_undef_input_test12:
   2458 ; SSE:       # BB#0:
   2459 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2460 ; SSE-NEXT:    retq
   2461 ;
   2462 ; AVX-LABEL: combine_undef_input_test12:
   2463 ; AVX:       # BB#0:
   2464 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2465 ; AVX-NEXT:    retq
   2466   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
   2467   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
   2468   ret <4 x float> %2
   2469 }
   2470 
   2471 define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
   2472 ; SSE-LABEL: combine_undef_input_test13:
   2473 ; SSE:       # BB#0:
   2474 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2475 ; SSE-NEXT:    retq
   2476 ;
   2477 ; AVX-LABEL: combine_undef_input_test13:
   2478 ; AVX:       # BB#0:
   2479 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2480 ; AVX-NEXT:    retq
   2481   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   2482   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
   2483   ret <4 x float> %2
   2484 }
   2485 
   2486 define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
   2487 ; SSE-LABEL: combine_undef_input_test14:
   2488 ; SSE:       # BB#0:
   2489 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   2490 ; SSE-NEXT:    movapd %xmm1, %xmm0
   2491 ; SSE-NEXT:    retq
   2492 ;
   2493 ; AVX-LABEL: combine_undef_input_test14:
   2494 ; AVX:       # BB#0:
   2495 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2496 ; AVX-NEXT:    retq
   2497   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2498   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   2499   ret <4 x float> %2
   2500 }
   2501 
   2502 define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
   2503 ; SSE2-LABEL: combine_undef_input_test15:
   2504 ; SSE2:       # BB#0:
   2505 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
   2506 ; SSE2-NEXT:    movapd %xmm1, %xmm0
   2507 ; SSE2-NEXT:    retq
   2508 ;
   2509 ; SSSE3-LABEL: combine_undef_input_test15:
   2510 ; SSSE3:       # BB#0:
   2511 ; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
   2512 ; SSSE3-NEXT:    movapd %xmm1, %xmm0
   2513 ; SSSE3-NEXT:    retq
   2514 ;
   2515 ; SSE41-LABEL: combine_undef_input_test15:
   2516 ; SSE41:       # BB#0:
   2517 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
   2518 ; SSE41-NEXT:    retq
   2519 ;
   2520 ; AVX-LABEL: combine_undef_input_test15:
   2521 ; AVX:       # BB#0:
   2522 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
   2523 ; AVX-NEXT:    retq
   2524   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   2525   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
   2526   ret <4 x float> %2
   2527 }
   2528 
   2529 
   2530 ; Verify that shuffles are canonicalized according to rules:
   2531 ;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
   2532 ;
   2533 ; This allows to trigger the following combine rule:
   2534 ;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
   2535 ;
   2536 ; As a result, all the shuffle pairs in each function below should be
   2537 ; combined into a single legal shuffle operation.
   2538 
   2539 define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
   2540 ; ALL-LABEL: combine_undef_input_test16:
   2541 ; ALL:       # BB#0:
   2542 ; ALL-NEXT:    retq
   2543   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
   2544   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
   2545   ret <4 x float> %2
   2546 }
   2547 
   2548 define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
   2549 ; SSE2-LABEL: combine_undef_input_test17:
   2550 ; SSE2:       # BB#0:
   2551 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
   2552 ; SSE2-NEXT:    retq
   2553 ;
   2554 ; SSSE3-LABEL: combine_undef_input_test17:
   2555 ; SSSE3:       # BB#0:
   2556 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2557 ; SSSE3-NEXT:    retq
   2558 ;
   2559 ; SSE41-LABEL: combine_undef_input_test17:
   2560 ; SSE41:       # BB#0:
   2561 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2562 ; SSE41-NEXT:    retq
   2563 ;
   2564 ; AVX-LABEL: combine_undef_input_test17:
   2565 ; AVX:       # BB#0:
   2566 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
   2567 ; AVX-NEXT:    retq
   2568   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
   2569   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
   2570   ret <4 x float> %2
   2571 }
   2572 
   2573 define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
   2574 ; SSE2-LABEL: combine_undef_input_test18:
   2575 ; SSE2:       # BB#0:
   2576 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
   2577 ; SSE2-NEXT:    retq
   2578 ;
   2579 ; SSSE3-LABEL: combine_undef_input_test18:
   2580 ; SSSE3:       # BB#0:
   2581 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2582 ; SSSE3-NEXT:    retq
   2583 ;
   2584 ; SSE41-LABEL: combine_undef_input_test18:
   2585 ; SSE41:       # BB#0:
   2586 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2587 ; SSE41-NEXT:    retq
   2588 ;
   2589 ; AVX-LABEL: combine_undef_input_test18:
   2590 ; AVX:       # BB#0:
   2591 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
   2592 ; AVX-NEXT:    retq
   2593   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   2594   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
   2595   ret <4 x float> %2
   2596 }
   2597 
   2598 define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
   2599 ; SSE-LABEL: combine_undef_input_test19:
   2600 ; SSE:       # BB#0:
   2601 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   2602 ; SSE-NEXT:    retq
   2603 ;
   2604 ; AVX-LABEL: combine_undef_input_test19:
   2605 ; AVX:       # BB#0:
   2606 ; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
   2607 ; AVX-NEXT:    retq
   2608   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2609   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   2610   ret <4 x float> %2
   2611 }
   2612 
   2613 define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
   2614 ; ALL-LABEL: combine_undef_input_test20:
   2615 ; ALL:       # BB#0:
   2616 ; ALL-NEXT:    retq
   2617   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   2618   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
   2619   ret <4 x float> %2
   2620 }
   2621 
   2622 ; These tests are designed to test the ability to combine away unnecessary
   2623 ; operations feeding into a shuffle. The AVX cases are the important ones as
   2624 ; they leverage operations which cannot be done naturally on the entire vector
   2625 ; and thus are decomposed into multiple smaller operations.
   2626 
   2627 define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
   2628 ; SSE-LABEL: combine_unneeded_subvector1:
   2629 ; SSE:       # BB#0:
   2630 ; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
   2631 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
   2632 ; SSE-NEXT:    movdqa %xmm0, %xmm1
   2633 ; SSE-NEXT:    retq
   2634 ;
   2635 ; AVX1-LABEL: combine_unneeded_subvector1:
   2636 ; AVX1:       # BB#0:
   2637 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2638 ; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
   2639 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
   2640 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
   2641 ; AVX1-NEXT:    retq
   2642 ;
   2643 ; AVX2-LABEL: combine_unneeded_subvector1:
   2644 ; AVX2:       # BB#0:
   2645 ; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
   2646 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
   2647 ; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
   2648 ; AVX2-NEXT:    retq
   2649   %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   2650   %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
   2651   ret <8 x i32> %c
   2652 }
   2653 
   2654 define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
   2655 ; SSE-LABEL: combine_unneeded_subvector2:
   2656 ; SSE:       # BB#0:
   2657 ; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
   2658 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
   2659 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
   2660 ; SSE-NEXT:    retq
   2661 ;
   2662 ; AVX1-LABEL: combine_unneeded_subvector2:
   2663 ; AVX1:       # BB#0:
   2664 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2665 ; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
   2666 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
   2667 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
   2668 ; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
   2669 ; AVX1-NEXT:    retq
   2670 ;
   2671 ; AVX2-LABEL: combine_unneeded_subvector2:
   2672 ; AVX2:       # BB#0:
   2673 ; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
   2674 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
   2675 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
   2676 ; AVX2-NEXT:    retq
   2677   %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   2678   %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
   2679   ret <8 x i32> %d
   2680 }
   2681 
   2682 define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
   2683 ; SSE2-LABEL: combine_insertps1:
   2684 ; SSE2:       # BB#0:
   2685 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
   2686 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
   2687 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   2688 ; SSE2-NEXT:    retq
   2689 ;
   2690 ; SSSE3-LABEL: combine_insertps1:
   2691 ; SSSE3:       # BB#0:
   2692 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
   2693 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
   2694 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   2695 ; SSSE3-NEXT:    retq
   2696 ;
   2697 ; SSE41-LABEL: combine_insertps1:
   2698 ; SSE41:       # BB#0:
   2699 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
   2700 ; SSE41-NEXT:    retq
   2701 ;
   2702 ; AVX-LABEL: combine_insertps1:
   2703 ; AVX:       # BB#0:
   2704 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
   2705 ; AVX-NEXT:    retq
   2706 
   2707   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
   2708   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
   2709   ret <4 x float> %d
   2710 }
   2711 
   2712 define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
   2713 ; SSE2-LABEL: combine_insertps2:
   2714 ; SSE2:       # BB#0:
   2715 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
   2716 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
   2717 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   2718 ; SSE2-NEXT:    retq
   2719 ;
   2720 ; SSSE3-LABEL: combine_insertps2:
   2721 ; SSSE3:       # BB#0:
   2722 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
   2723 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
   2724 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   2725 ; SSSE3-NEXT:    retq
   2726 ;
   2727 ; SSE41-LABEL: combine_insertps2:
   2728 ; SSE41:       # BB#0:
   2729 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
   2730 ; SSE41-NEXT:    retq
   2731 ;
   2732 ; AVX-LABEL: combine_insertps2:
   2733 ; AVX:       # BB#0:
   2734 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
   2735 ; AVX-NEXT:    retq
   2736 
   2737   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
   2738   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
   2739   ret <4 x float> %d
   2740 }
   2741 
   2742 define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
   2743 ; SSE2-LABEL: combine_insertps3:
   2744 ; SSE2:       # BB#0:
   2745 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
   2746 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
   2747 ; SSE2-NEXT:    retq
   2748 ;
   2749 ; SSSE3-LABEL: combine_insertps3:
   2750 ; SSSE3:       # BB#0:
   2751 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
   2752 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
   2753 ; SSSE3-NEXT:    retq
   2754 ;
   2755 ; SSE41-LABEL: combine_insertps3:
   2756 ; SSE41:       # BB#0:
   2757 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
   2758 ; SSE41-NEXT:    retq
   2759 ;
   2760 ; AVX-LABEL: combine_insertps3:
   2761 ; AVX:       # BB#0:
   2762 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
   2763 ; AVX-NEXT:    retq
   2764 
   2765   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
   2766   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
   2767   ret <4 x float> %d
   2768 }
   2769 
   2770 define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
   2771 ; SSE2-LABEL: combine_insertps4:
   2772 ; SSE2:       # BB#0:
   2773 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
   2774 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
   2775 ; SSE2-NEXT:    retq
   2776 ;
   2777 ; SSSE3-LABEL: combine_insertps4:
   2778 ; SSSE3:       # BB#0:
   2779 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
   2780 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
   2781 ; SSSE3-NEXT:    retq
   2782 ;
   2783 ; SSE41-LABEL: combine_insertps4:
   2784 ; SSE41:       # BB#0:
   2785 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
   2786 ; SSE41-NEXT:    retq
   2787 ;
   2788 ; AVX-LABEL: combine_insertps4:
   2789 ; AVX:       # BB#0:
   2790 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
   2791 ; AVX-NEXT:    retq
   2792 
   2793   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
   2794   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
   2795   ret <4 x float> %d
   2796 }
   2797 
; Regression test for PR22377: split %a into its odd elements [a1,a3,a1,a3]
; and even elements [a0,a2,a0,a2], add the two halves, then interleave the
; even elements with the sums.  The shuffles feeding the fadd must survive
; shuffle combining; the result is [a0, a0+a1, a2, a2+a3].
define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: PR22377:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT:    addps %xmm0, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: PR22377:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    retq
entry:
  ; %s1 = [a1, a3, a1, a3] (odd elements, duplicated)
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
  ; %s2 = [a0, a2, a0, a2] (even elements, duplicated)
  %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  %r2 = fadd <4 x float> %s1, %s2
  ; interleave low halves of %s2 and %r2: [a0, a0+a1, a2, a2+a3]
  %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %s3
}
   2822 
; Regression test for PR22390: %s1 rotates %a to [a3, a0, a1, a2]; %s2 is %s1
; with its first element replaced by %b[0].  Both values feed the fadd, so the
; rotated %s1 must be kept live and not folded into the blend/movss that
; forms %s2.
define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: PR22390:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    addps %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22390:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSSE3-NEXT:    addps %xmm0, %xmm2
; SSSE3-NEXT:    movaps %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22390:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR22390:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  ; %s1 = [a3, a0, a1, a2]
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
  ; %s2 = [b0, a0, a1, a2] (element 0 of %s1 replaced by %b[0])
  %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  %r2 = fadd <4 x float> %s1, %s2
  ret <4 x float> %r2
}
   2861 
; Regression test for PR22412: a 256-bit blend (%s1 = [a0,a1,b2,b3,b4,b5,b6,b7])
; followed by a lane-crossing permute ([1,0,7,6,5,4,3,2]), giving
; [a1,a0,b7,b6,b5,b4,b3,b2].  The combiner must handle the cross-lane shuffle:
; AVX1 needs vperm2f128 + vshufps, AVX2 can use a single vpermps, and the
; 128-bit SSE targets lower it as paired shufps on the two halves.
define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; SSE2-LABEL: PR22412:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    movapd %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22412:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSSE3-NEXT:    movapd %xmm2, %xmm0
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
; SSSE3-NEXT:    movaps %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22412:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
; SSE41-NEXT:    movapd %xmm0, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2]
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: PR22412:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR22412:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
entry:
  ; %s1 = [a0, a1, b2, b3, b4, b5, b6, b7]
  %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; %s2 = [a1, a0, b7, b6, b5, b4, b3, b2] (swap first pair, reverse the rest)
  %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
  ret <8 x float> %s2
}
   2909