      1 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
      2 ; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
      3 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
      4 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
      5 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
      6 ;
      7 ; Verify that the DAG combiner correctly folds bitwise operations across
      8 ; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
      9 ; basic and always-safe patterns. Also test that the DAG combiner will combine
     10 ; target-specific shuffle instructions where reasonable.
     11 
     12 target triple = "x86_64-unknown-unknown"
     13 
     14 declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
     15 declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
     16 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
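
; As a quick illustration of the simplest fold tested below (combine_pshufd1):
; control byte 27 (0b00011011) reverses the four dword lanes, so applying it
; twice is a no-op and both shuffles should be elided entirely. A minimal
; sketch of that fold (illustrative only, not checked by FileCheck):
;   %t = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %v, i8 27) ; %t = <v3,v2,v1,v0>
;   %u = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %t, i8 27) ; %u = <v0,v1,v2,v3> == %v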
     17 
     18 define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
     19 ; ALL-LABEL: combine_pshufd1:
     20 ; ALL:       # BB#0: # %entry
     21 ; ALL-NEXT:    retq
     22 entry:
     23   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
     24   %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
     25   ret <4 x i32> %c
     26 }
     27 
     28 define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
     29 ; ALL-LABEL: combine_pshufd2:
     30 ; ALL:       # BB#0: # %entry
     31 ; ALL-NEXT:    retq
     32 entry:
     33   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
     34   %b.cast = bitcast <4 x i32> %b to <8 x i16>
     35   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
     36   %c.cast = bitcast <8 x i16> %c to <4 x i32>
     37   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
     38   ret <4 x i32> %d
     39 }
     40 
     41 define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
     42 ; ALL-LABEL: combine_pshufd3:
     43 ; ALL:       # BB#0: # %entry
     44 ; ALL-NEXT:    retq
     45 entry:
     46   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
     47   %b.cast = bitcast <4 x i32> %b to <8 x i16>
     48   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
     49   %c.cast = bitcast <8 x i16> %c to <4 x i32>
     50   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
     51   ret <4 x i32> %d
     52 }
     53 
     54 define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
     55 ; SSE-LABEL: combine_pshufd4:
     56 ; SSE:       # BB#0: # %entry
     57 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
     58 ; SSE-NEXT:    retq
     59 ;
     60 ; AVX-LABEL: combine_pshufd4:
     61 ; AVX:       # BB#0: # %entry
     62 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
     63 ; AVX-NEXT:    retq
     64 entry:
     65   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
     66   %b.cast = bitcast <4 x i32> %b to <8 x i16>
     67   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
     68   %c.cast = bitcast <8 x i16> %c to <4 x i32>
     69   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
     70   ret <4 x i32> %d
     71 }
     72 
     73 define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
     74 ; SSE-LABEL: combine_pshufd5:
     75 ; SSE:       # BB#0: # %entry
     76 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
     77 ; SSE-NEXT:    retq
     78 ;
     79 ; AVX-LABEL: combine_pshufd5:
     80 ; AVX:       # BB#0: # %entry
     81 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
     82 ; AVX-NEXT:    retq
     83 entry:
     84   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
     85   %b.cast = bitcast <4 x i32> %b to <8 x i16>
     86   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
     87   %c.cast = bitcast <8 x i16> %c to <4 x i32>
     88   %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
     89   ret <4 x i32> %d
     90 }
     91 
     92 define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
     93 ; SSE-LABEL: combine_pshufd6:
     94 ; SSE:       # BB#0: # %entry
     95 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
     96 ; SSE-NEXT:    retq
     97 ;
     98 ; AVX-LABEL: combine_pshufd6:
     99 ; AVX:       # BB#0: # %entry
    100 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
    101 ; AVX-NEXT:    retq
    102 entry:
    103   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
    104   %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
    105   ret <4 x i32> %c
    106 }
    107 
    108 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
    109 ; ALL-LABEL: combine_pshuflw1:
    110 ; ALL:       # BB#0: # %entry
    111 ; ALL-NEXT:    retq
    112 entry:
    113   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
    114   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
    115   ret <8 x i16> %c
    116 }
    117 
    118 define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
    119 ; ALL-LABEL: combine_pshuflw2:
    120 ; ALL:       # BB#0: # %entry
    121 ; ALL-NEXT:    retq
    122 entry:
    123   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
    124   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
    125   %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
    126   ret <8 x i16> %d
    127 }
    128 
    129 define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
    130 ; SSE-LABEL: combine_pshuflw3:
    131 ; SSE:       # BB#0: # %entry
    132 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
    133 ; SSE-NEXT:    retq
    134 ;
    135 ; AVX-LABEL: combine_pshuflw3:
    136 ; AVX:       # BB#0: # %entry
    137 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
    138 ; AVX-NEXT:    retq
    139 entry:
    140   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
    141   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
    142   %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
    143   ret <8 x i16> %d
    144 }
    145 
    146 define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
    147 ; SSE-LABEL: combine_pshufhw1:
    148 ; SSE:       # BB#0: # %entry
    149 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
    150 ; SSE-NEXT:    retq
    151 ;
    152 ; AVX-LABEL: combine_pshufhw1:
    153 ; AVX:       # BB#0: # %entry
    154 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
    155 ; AVX-NEXT:    retq
    156 entry:
    157   %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
    158   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
    159   %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
    160   ret <8 x i16> %d
    161 }
    162 
    163 define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    164 ; SSE-LABEL: combine_bitwise_ops_test1:
    165 ; SSE:       # BB#0:
    166 ; SSE-NEXT:    pand %xmm1, %xmm0
    167 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    168 ; SSE-NEXT:    retq
    169 ;
    170 ; AVX-LABEL: combine_bitwise_ops_test1:
    171 ; AVX:       # BB#0:
    172 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
    173 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    174 ; AVX-NEXT:    retq
    175   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    176   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    177   %and = and <4 x i32> %shuf1, %shuf2
    178   ret <4 x i32> %and
    179 }
    180 
    181 define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    182 ; SSE-LABEL: combine_bitwise_ops_test2:
    183 ; SSE:       # BB#0:
    184 ; SSE-NEXT:    por %xmm1, %xmm0
    185 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    186 ; SSE-NEXT:    retq
    187 ;
    188 ; AVX-LABEL: combine_bitwise_ops_test2:
    189 ; AVX:       # BB#0:
    190 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
    191 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    192 ; AVX-NEXT:    retq
    193   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    194   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    195   %or = or <4 x i32> %shuf1, %shuf2
    196   ret <4 x i32> %or
    197 }
    198 
    199 define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    200 ; SSE-LABEL: combine_bitwise_ops_test3:
    201 ; SSE:       # BB#0:
    202 ; SSE-NEXT:    pxor %xmm1, %xmm0
    203 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    204 ; SSE-NEXT:    retq
    205 ;
    206 ; AVX-LABEL: combine_bitwise_ops_test3:
    207 ; AVX:       # BB#0:
    208 ; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    209 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    210 ; AVX-NEXT:    retq
    211   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    212   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
    213   %xor = xor <4 x i32> %shuf1, %shuf2
    214   ret <4 x i32> %xor
    215 }
    216 
    217 define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    218 ; SSE-LABEL: combine_bitwise_ops_test4:
    219 ; SSE:       # BB#0:
    220 ; SSE-NEXT:    pand %xmm1, %xmm0
    221 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    222 ; SSE-NEXT:    retq
    223 ;
    224 ; AVX-LABEL: combine_bitwise_ops_test4:
    225 ; AVX:       # BB#0:
    226 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
    227 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    228 ; AVX-NEXT:    retq
    229   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    230   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    231   %and = and <4 x i32> %shuf1, %shuf2
    232   ret <4 x i32> %and
    233 }
    234 
    235 define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    236 ; SSE-LABEL: combine_bitwise_ops_test5:
    237 ; SSE:       # BB#0:
    238 ; SSE-NEXT:    por %xmm1, %xmm0
    239 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    240 ; SSE-NEXT:    retq
    241 ;
    242 ; AVX-LABEL: combine_bitwise_ops_test5:
    243 ; AVX:       # BB#0:
    244 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
    245 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    246 ; AVX-NEXT:    retq
    247   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    248   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    249   %or = or <4 x i32> %shuf1, %shuf2
    250   ret <4 x i32> %or
    251 }
    252 
    253 define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    254 ; SSE-LABEL: combine_bitwise_ops_test6:
    255 ; SSE:       # BB#0:
    256 ; SSE-NEXT:    pxor %xmm1, %xmm0
    257 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    258 ; SSE-NEXT:    retq
    259 ;
    260 ; AVX-LABEL: combine_bitwise_ops_test6:
    261 ; AVX:       # BB#0:
    262 ; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    263 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    264 ; AVX-NEXT:    retq
    265   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    266   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
    267   %xor = xor <4 x i32> %shuf1, %shuf2
    268   ret <4 x i32> %xor
    269 }
    270 
    271 
     272 ; Verify that DAGCombiner moves the shuffle after the xor/and/or even if the
     273 ; shuffles are not performing a swizzle operation.
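; With identical shuffle masks the bitwise op commutes with the shuffles; a sketch
; of the reasoning for the tests below (not part of the checked output):
;   shuffle(a, c, M) & shuffle(b, c, M) == shuffle(a & b, c & c, M) == shuffle(a & b, c, M)
; so the and/or can be performed first and %c merged in with a single shuffle or
; blend afterwards. For xor, c ^ c == 0, which is why the xor variants below
; (test3b/test6b) end up merging with zero instead of %c.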
    274 
    275 define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    276 ; SSE2-LABEL: combine_bitwise_ops_test1b:
    277 ; SSE2:       # BB#0:
    278 ; SSE2-NEXT:    pand %xmm1, %xmm0
    279 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    280 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    281 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    282 ; SSE2-NEXT:    retq
    283 ;
    284 ; SSSE3-LABEL: combine_bitwise_ops_test1b:
    285 ; SSSE3:       # BB#0:
    286 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    287 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    288 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    289 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    290 ; SSSE3-NEXT:    retq
    291 ;
    292 ; SSE41-LABEL: combine_bitwise_ops_test1b:
    293 ; SSE41:       # BB#0:
    294 ; SSE41-NEXT:    pand %xmm1, %xmm0
    295 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    296 ; SSE41-NEXT:    retq
    297 ;
    298 ; AVX1-LABEL: combine_bitwise_ops_test1b:
    299 ; AVX1:       # BB#0:
    300 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
    301 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    302 ; AVX1-NEXT:    retq
    303 ;
    304 ; AVX2-LABEL: combine_bitwise_ops_test1b:
    305 ; AVX2:       # BB#0:
    306 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
    307 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
    308 ; AVX2-NEXT:    retq
    309   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    310   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    311   %and = and <4 x i32> %shuf1, %shuf2
    312   ret <4 x i32> %and
    313 }
    314 
    315 define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    316 ; SSE2-LABEL: combine_bitwise_ops_test2b:
    317 ; SSE2:       # BB#0:
    318 ; SSE2-NEXT:    por %xmm1, %xmm0
    319 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    320 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    321 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    322 ; SSE2-NEXT:    retq
    323 ;
    324 ; SSSE3-LABEL: combine_bitwise_ops_test2b:
    325 ; SSSE3:       # BB#0:
    326 ; SSSE3-NEXT:    por %xmm1, %xmm0
    327 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    328 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    329 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    330 ; SSSE3-NEXT:    retq
    331 ;
    332 ; SSE41-LABEL: combine_bitwise_ops_test2b:
    333 ; SSE41:       # BB#0:
    334 ; SSE41-NEXT:    por %xmm1, %xmm0
    335 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    336 ; SSE41-NEXT:    retq
    337 ;
    338 ; AVX1-LABEL: combine_bitwise_ops_test2b:
    339 ; AVX1:       # BB#0:
    340 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
    341 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    342 ; AVX1-NEXT:    retq
    343 ;
    344 ; AVX2-LABEL: combine_bitwise_ops_test2b:
    345 ; AVX2:       # BB#0:
    346 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
    347 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
    348 ; AVX2-NEXT:    retq
    349   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    350   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    351   %or = or <4 x i32> %shuf1, %shuf2
    352   ret <4 x i32> %or
    353 }
    354 
    355 define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    356 ; SSE2-LABEL: combine_bitwise_ops_test3b:
    357 ; SSE2:       # BB#0:
    358 ; SSE2-NEXT:    xorps %xmm1, %xmm0
    359 ; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
    360 ; SSE2-NEXT:    retq
    361 ;
    362 ; SSSE3-LABEL: combine_bitwise_ops_test3b:
    363 ; SSSE3:       # BB#0:
    364 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
    365 ; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
    366 ; SSSE3-NEXT:    retq
    367 ;
    368 ; SSE41-LABEL: combine_bitwise_ops_test3b:
    369 ; SSE41:       # BB#0:
    370 ; SSE41-NEXT:    pxor %xmm1, %xmm0
    371 ; SSE41-NEXT:    pxor %xmm1, %xmm1
    372 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
    373 ; SSE41-NEXT:    retq
    374 ;
    375 ; AVX1-LABEL: combine_bitwise_ops_test3b:
    376 ; AVX1:       # BB#0:
    377 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    378 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    379 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
    380 ; AVX1-NEXT:    retq
    381 ;
    382 ; AVX2-LABEL: combine_bitwise_ops_test3b:
    383 ; AVX2:       # BB#0:
    384 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    385 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    386 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
    387 ; AVX2-NEXT:    retq
    388   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    389   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    390   %xor = xor <4 x i32> %shuf1, %shuf2
    391   ret <4 x i32> %xor
    392 }
    393 
    394 define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    395 ; SSE2-LABEL: combine_bitwise_ops_test4b:
    396 ; SSE2:       # BB#0:
    397 ; SSE2-NEXT:    pand %xmm1, %xmm0
    398 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    399 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    400 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    401 ; SSE2-NEXT:    retq
    402 ;
    403 ; SSSE3-LABEL: combine_bitwise_ops_test4b:
    404 ; SSSE3:       # BB#0:
    405 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    406 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    407 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    408 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    409 ; SSSE3-NEXT:    retq
    410 ;
    411 ; SSE41-LABEL: combine_bitwise_ops_test4b:
    412 ; SSE41:       # BB#0:
    413 ; SSE41-NEXT:    pand %xmm1, %xmm0
    414 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    415 ; SSE41-NEXT:    retq
    416 ;
    417 ; AVX1-LABEL: combine_bitwise_ops_test4b:
    418 ; AVX1:       # BB#0:
    419 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
    420 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    421 ; AVX1-NEXT:    retq
    422 ;
    423 ; AVX2-LABEL: combine_bitwise_ops_test4b:
    424 ; AVX2:       # BB#0:
    425 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
    426 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
    427 ; AVX2-NEXT:    retq
    428   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    429   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    430   %and = and <4 x i32> %shuf1, %shuf2
    431   ret <4 x i32> %and
    432 }
    433 
    434 define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    435 ; SSE2-LABEL: combine_bitwise_ops_test5b:
    436 ; SSE2:       # BB#0:
    437 ; SSE2-NEXT:    por %xmm1, %xmm0
    438 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    439 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    440 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    441 ; SSE2-NEXT:    retq
    442 ;
    443 ; SSSE3-LABEL: combine_bitwise_ops_test5b:
    444 ; SSSE3:       # BB#0:
    445 ; SSSE3-NEXT:    por %xmm1, %xmm0
    446 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    447 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    448 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    449 ; SSSE3-NEXT:    retq
    450 ;
    451 ; SSE41-LABEL: combine_bitwise_ops_test5b:
    452 ; SSE41:       # BB#0:
    453 ; SSE41-NEXT:    por %xmm1, %xmm0
    454 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    455 ; SSE41-NEXT:    retq
    456 ;
    457 ; AVX1-LABEL: combine_bitwise_ops_test5b:
    458 ; AVX1:       # BB#0:
    459 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
    460 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    461 ; AVX1-NEXT:    retq
    462 ;
    463 ; AVX2-LABEL: combine_bitwise_ops_test5b:
    464 ; AVX2:       # BB#0:
    465 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
    466 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
    467 ; AVX2-NEXT:    retq
    468   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    469   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    470   %or = or <4 x i32> %shuf1, %shuf2
    471   ret <4 x i32> %or
    472 }
    473 
    474 define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    475 ; SSE2-LABEL: combine_bitwise_ops_test6b:
    476 ; SSE2:       # BB#0:
    477 ; SSE2-NEXT:    xorps %xmm1, %xmm0
    478 ; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
    479 ; SSE2-NEXT:    retq
    480 ;
    481 ; SSSE3-LABEL: combine_bitwise_ops_test6b:
    482 ; SSSE3:       # BB#0:
    483 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
    484 ; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
    485 ; SSSE3-NEXT:    retq
    486 ;
    487 ; SSE41-LABEL: combine_bitwise_ops_test6b:
    488 ; SSE41:       # BB#0:
    489 ; SSE41-NEXT:    pxor %xmm1, %xmm0
    490 ; SSE41-NEXT:    pxor %xmm1, %xmm1
    491 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
    492 ; SSE41-NEXT:    retq
    493 ;
    494 ; AVX1-LABEL: combine_bitwise_ops_test6b:
    495 ; AVX1:       # BB#0:
    496 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    497 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    498 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
    499 ; AVX1-NEXT:    retq
    500 ;
    501 ; AVX2-LABEL: combine_bitwise_ops_test6b:
    502 ; AVX2:       # BB#0:
    503 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    504 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    505 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
    506 ; AVX2-NEXT:    retq
    507   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    508   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
    509   %xor = xor <4 x i32> %shuf1, %shuf2
    510   ret <4 x i32> %xor
    511 }
    512 
    513 define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    514 ; SSE2-LABEL: combine_bitwise_ops_test1c:
    515 ; SSE2:       # BB#0:
    516 ; SSE2-NEXT:    pand %xmm1, %xmm0
    517 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    518 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    519 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    520 ; SSE2-NEXT:    retq
    521 ;
    522 ; SSSE3-LABEL: combine_bitwise_ops_test1c:
    523 ; SSSE3:       # BB#0:
    524 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    525 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    526 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    527 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    528 ; SSSE3-NEXT:    retq
    529 ;
    530 ; SSE41-LABEL: combine_bitwise_ops_test1c:
    531 ; SSE41:       # BB#0:
    532 ; SSE41-NEXT:    pand %xmm1, %xmm0
    533 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    534 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    535 ; SSE41-NEXT:    retq
    536 ;
    537 ; AVX1-LABEL: combine_bitwise_ops_test1c:
    538 ; AVX1:       # BB#0:
    539 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
    540 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    541 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    542 ; AVX1-NEXT:    retq
    543 ;
    544 ; AVX2-LABEL: combine_bitwise_ops_test1c:
    545 ; AVX2:       # BB#0:
    546 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
    547 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
    548 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    549 ; AVX2-NEXT:    retq
    550   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    551   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    552   %and = and <4 x i32> %shuf1, %shuf2
    553   ret <4 x i32> %and
    554 }
    555 
    556 define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    557 ; SSE2-LABEL: combine_bitwise_ops_test2c:
    558 ; SSE2:       # BB#0:
    559 ; SSE2-NEXT:    por %xmm1, %xmm0
    560 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    561 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    562 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    563 ; SSE2-NEXT:    retq
    564 ;
    565 ; SSSE3-LABEL: combine_bitwise_ops_test2c:
    566 ; SSSE3:       # BB#0:
    567 ; SSSE3-NEXT:    por %xmm1, %xmm0
    568 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    569 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
    570 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    571 ; SSSE3-NEXT:    retq
    572 ;
    573 ; SSE41-LABEL: combine_bitwise_ops_test2c:
    574 ; SSE41:       # BB#0:
    575 ; SSE41-NEXT:    por %xmm1, %xmm0
    576 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    577 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    578 ; SSE41-NEXT:    retq
    579 ;
    580 ; AVX1-LABEL: combine_bitwise_ops_test2c:
    581 ; AVX1:       # BB#0:
    582 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
    583 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    584 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    585 ; AVX1-NEXT:    retq
    586 ;
    587 ; AVX2-LABEL: combine_bitwise_ops_test2c:
    588 ; AVX2:       # BB#0:
    589 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
    590 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
    591 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    592 ; AVX2-NEXT:    retq
    593   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    594   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    595   %or = or <4 x i32> %shuf1, %shuf2
    596   ret <4 x i32> %or
    597 }
    598 
    599 define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    600 ; SSE2-LABEL: combine_bitwise_ops_test3c:
    601 ; SSE2:       # BB#0:
    602 ; SSE2-NEXT:    pxor %xmm1, %xmm0
    603 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
    604 ; SSE2-NEXT:    pxor %xmm1, %xmm1
    605 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
    606 ; SSE2-NEXT:    retq
    607 ;
    608 ; SSSE3-LABEL: combine_bitwise_ops_test3c:
    609 ; SSSE3:       # BB#0:
    610 ; SSSE3-NEXT:    pxor %xmm1, %xmm0
    611 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
    612 ; SSSE3-NEXT:    pxor %xmm1, %xmm1
    613 ; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
    614 ; SSSE3-NEXT:    retq
    615 ;
    616 ; SSE41-LABEL: combine_bitwise_ops_test3c:
    617 ; SSE41:       # BB#0:
    618 ; SSE41-NEXT:    pxor %xmm1, %xmm0
    619 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    620 ; SSE41-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
    621 ; SSE41-NEXT:    retq
    622 ;
    623 ; AVX-LABEL: combine_bitwise_ops_test3c:
    624 ; AVX:       # BB#0:
    625 ; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    626 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    627 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
    628 ; AVX-NEXT:    retq
    629   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    630   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    631   %xor = xor <4 x i32> %shuf1, %shuf2
    632   ret <4 x i32> %xor
    633 }
    634 
    635 define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    636 ; SSE2-LABEL: combine_bitwise_ops_test4c:
    637 ; SSE2:       # BB#0:
    638 ; SSE2-NEXT:    pand %xmm1, %xmm0
    639 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    640 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    641 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    642 ; SSE2-NEXT:    retq
    643 ;
    644 ; SSSE3-LABEL: combine_bitwise_ops_test4c:
    645 ; SSSE3:       # BB#0:
    646 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    647 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    648 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    649 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    650 ; SSSE3-NEXT:    retq
    651 ;
    652 ; SSE41-LABEL: combine_bitwise_ops_test4c:
    653 ; SSE41:       # BB#0:
    654 ; SSE41-NEXT:    pand %xmm1, %xmm0
    655 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    656 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    657 ; SSE41-NEXT:    retq
    658 ;
    659 ; AVX1-LABEL: combine_bitwise_ops_test4c:
    660 ; AVX1:       # BB#0:
    661 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
    662 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    663 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    664 ; AVX1-NEXT:    retq
    665 ;
    666 ; AVX2-LABEL: combine_bitwise_ops_test4c:
    667 ; AVX2:       # BB#0:
    668 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
    669 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
    670 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    671 ; AVX2-NEXT:    retq
    672   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    673   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    674   %and = and <4 x i32> %shuf1, %shuf2
    675   ret <4 x i32> %and
    676 }
    677 
    678 define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    679 ; SSE2-LABEL: combine_bitwise_ops_test5c:
    680 ; SSE2:       # BB#0:
    681 ; SSE2-NEXT:    por %xmm1, %xmm0
    682 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    683 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    684 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    685 ; SSE2-NEXT:    retq
    686 ;
    687 ; SSSE3-LABEL: combine_bitwise_ops_test5c:
    688 ; SSSE3:       # BB#0:
    689 ; SSSE3-NEXT:    por %xmm1, %xmm0
    690 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    691 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
    692 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    693 ; SSSE3-NEXT:    retq
    694 ;
    695 ; SSE41-LABEL: combine_bitwise_ops_test5c:
    696 ; SSE41:       # BB#0:
    697 ; SSE41-NEXT:    por %xmm1, %xmm0
    698 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    699 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    700 ; SSE41-NEXT:    retq
    701 ;
    702 ; AVX1-LABEL: combine_bitwise_ops_test5c:
    703 ; AVX1:       # BB#0:
    704 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
    705 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    706 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    707 ; AVX1-NEXT:    retq
    708 ;
    709 ; AVX2-LABEL: combine_bitwise_ops_test5c:
    710 ; AVX2:       # BB#0:
    711 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
    712 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
    713 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
    714 ; AVX2-NEXT:    retq
    715   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    716   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    717   %or = or <4 x i32> %shuf1, %shuf2
    718   ret <4 x i32> %or
    719 }
    720 
    721 define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    722 ; SSE2-LABEL: combine_bitwise_ops_test6c:
    723 ; SSE2:       # BB#0:
    724 ; SSE2-NEXT:    pxor %xmm1, %xmm0
    725 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    726 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    727 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    728 ; SSE2-NEXT:    retq
    729 ;
    730 ; SSSE3-LABEL: combine_bitwise_ops_test6c:
    731 ; SSSE3:       # BB#0:
    732 ; SSSE3-NEXT:    pxor %xmm1, %xmm0
    733 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
    734 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
    735 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    736 ; SSSE3-NEXT:    retq
    737 ;
    738 ; SSE41-LABEL: combine_bitwise_ops_test6c:
    739 ; SSE41:       # BB#0:
    740 ; SSE41-NEXT:    pxor %xmm1, %xmm0
    741 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
    742 ; SSE41-NEXT:    pxor %xmm0, %xmm0
    743 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
    744 ; SSE41-NEXT:    retq
    745 ;
    746 ; AVX1-LABEL: combine_bitwise_ops_test6c:
    747 ; AVX1:       # BB#0:
    748 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    749 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
    750 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    751 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
    752 ; AVX1-NEXT:    retq
    753 ;
    754 ; AVX2-LABEL: combine_bitwise_ops_test6c:
    755 ; AVX2:       # BB#0:
    756 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
    757 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
    758 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    759 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
    760 ; AVX2-NEXT:    retq
    761   %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    762   %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
    763   %xor = xor <4 x i32> %shuf1, %shuf2
    764   ret <4 x i32> %xor
    765 }
    766 
    767 define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
    768 ; SSE-LABEL: combine_nested_undef_test1:
    769 ; SSE:       # BB#0:
    770 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
    771 ; SSE-NEXT:    retq
    772 ;
    773 ; AVX-LABEL: combine_nested_undef_test1:
    774 ; AVX:       # BB#0:
    775 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
    776 ; AVX-NEXT:    retq
    777   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
    778   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
    779   ret <4 x i32> %2
    780 }
    781 
    782 define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
    783 ; SSE-LABEL: combine_nested_undef_test2:
    784 ; SSE:       # BB#0:
    785 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
    786 ; SSE-NEXT:    retq
    787 ;
    788 ; AVX-LABEL: combine_nested_undef_test2:
    789 ; AVX:       # BB#0:
    790 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
    791 ; AVX-NEXT:    retq
    792   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
    793   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
    794   ret <4 x i32> %2
    795 }
    796 
    797 define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
    798 ; SSE-LABEL: combine_nested_undef_test3:
    799 ; SSE:       # BB#0:
    800 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
    801 ; SSE-NEXT:    retq
    802 ;
    803 ; AVX-LABEL: combine_nested_undef_test3:
    804 ; AVX:       # BB#0:
    805 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
    806 ; AVX-NEXT:    retq
    807   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
    808   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
    809   ret <4 x i32> %2
    810 }
    811 
    812 define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
    813 ; SSE-LABEL: combine_nested_undef_test4:
    814 ; SSE:       # BB#0:
    815 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
    816 ; SSE-NEXT:    retq
    817 ;
    818 ; AVX1-LABEL: combine_nested_undef_test4:
    819 ; AVX1:       # BB#0:
    820 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
    821 ; AVX1-NEXT:    retq
    822 ;
    823 ; AVX2-LABEL: combine_nested_undef_test4:
    824 ; AVX2:       # BB#0:
    825 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
    826 ; AVX2-NEXT:    retq
    827   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
    828   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
    829   ret <4 x i32> %2
    830 }
    831 
    832 define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
    833 ; SSE-LABEL: combine_nested_undef_test5:
    834 ; SSE:       # BB#0:
    835 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
    836 ; SSE-NEXT:    retq
    837 ;
    838 ; AVX-LABEL: combine_nested_undef_test5:
    839 ; AVX:       # BB#0:
    840 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
    841 ; AVX-NEXT:    retq
    842   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
    843   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
    844   ret <4 x i32> %2
    845 }
    846 
    847 define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
    848 ; SSE-LABEL: combine_nested_undef_test6:
    849 ; SSE:       # BB#0:
    850 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
    851 ; SSE-NEXT:    retq
    852 ;
    853 ; AVX-LABEL: combine_nested_undef_test6:
    854 ; AVX:       # BB#0:
    855 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
    856 ; AVX-NEXT:    retq
    857   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
    858   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
    859   ret <4 x i32> %2
    860 }
    861 
    862 define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
    863 ; SSE-LABEL: combine_nested_undef_test7:
    864 ; SSE:       # BB#0:
    865 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
    866 ; SSE-NEXT:    retq
    867 ;
    868 ; AVX-LABEL: combine_nested_undef_test7:
    869 ; AVX:       # BB#0:
    870 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
    871 ; AVX-NEXT:    retq
    872   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    873   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
    874   ret <4 x i32> %2
    875 }
    876 
    877 define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
    878 ; SSE-LABEL: combine_nested_undef_test8:
    879 ; SSE:       # BB#0:
    880 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
    881 ; SSE-NEXT:    retq
    882 ;
    883 ; AVX-LABEL: combine_nested_undef_test8:
    884 ; AVX:       # BB#0:
    885 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
    886 ; AVX-NEXT:    retq
    887   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
    888   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
    889   ret <4 x i32> %2
    890 }
    891 
    892 define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
    893 ; SSE-LABEL: combine_nested_undef_test9:
    894 ; SSE:       # BB#0:
    895 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
    896 ; SSE-NEXT:    retq
    897 ;
    898 ; AVX-LABEL: combine_nested_undef_test9:
    899 ; AVX:       # BB#0:
    900 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
    901 ; AVX-NEXT:    retq
    902   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
    903   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
    904   ret <4 x i32> %2
    905 }
    906 
    907 define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
    908 ; SSE-LABEL: combine_nested_undef_test10:
    909 ; SSE:       # BB#0:
    910 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
    911 ; SSE-NEXT:    retq
    912 ;
    913 ; AVX-LABEL: combine_nested_undef_test10:
    914 ; AVX:       # BB#0:
    915 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
    916 ; AVX-NEXT:    retq
    917   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
    918   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
    919   ret <4 x i32> %2
    920 }
    921 
    922 define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
    923 ; SSE-LABEL: combine_nested_undef_test11:
    924 ; SSE:       # BB#0:
    925 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
    926 ; SSE-NEXT:    retq
    927 ;
    928 ; AVX-LABEL: combine_nested_undef_test11:
    929 ; AVX:       # BB#0:
    930 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
    931 ; AVX-NEXT:    retq
    932   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
    933   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
    934   ret <4 x i32> %2
    935 }
    936 
    937 define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
    938 ; SSE-LABEL: combine_nested_undef_test12:
    939 ; SSE:       # BB#0:
    940 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
    941 ; SSE-NEXT:    retq
    942 ;
    943 ; AVX1-LABEL: combine_nested_undef_test12:
    944 ; AVX1:       # BB#0:
    945 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
    946 ; AVX1-NEXT:    retq
    947 ;
    948 ; AVX2-LABEL: combine_nested_undef_test12:
    949 ; AVX2:       # BB#0:
    950 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
    951 ; AVX2-NEXT:    retq
    952   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
    953   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
    954   ret <4 x i32> %2
    955 }
    956 
    957 ; The following pair of shuffles is folded into vector %A.
    958 define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
    959 ; ALL-LABEL: combine_nested_undef_test13:
    960 ; ALL:       # BB#0:
    961 ; ALL-NEXT:    retq
    962   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
    963   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
    964   ret <4 x i32> %2
    965 }
    966 
    967 ; The following pair of shuffles is folded into vector %B.
    968 define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
    969 ; SSE-LABEL: combine_nested_undef_test14:
    970 ; SSE:       # BB#0:
    971 ; SSE-NEXT:    movaps %xmm1, %xmm0
    972 ; SSE-NEXT:    retq
    973 ;
    974 ; AVX-LABEL: combine_nested_undef_test14:
    975 ; AVX:       # BB#0:
    976 ; AVX-NEXT:    vmovaps %xmm1, %xmm0
    977 ; AVX-NEXT:    retq
    978   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
    979   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
    980   ret <4 x i32> %2
    981 }
    982 
    983 
    984 ; Verify that we don't optimize the following cases. We expect more than one shuffle.
    985 ;
    986 ; FIXME: Many of these already don't make sense, and the rest should stop
     987 ; making sense with the new vector shuffle lowering. Revisit at least testing for
    988 ; it.
    989 
    990 define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
    991 ; SSE2-LABEL: combine_nested_undef_test15:
    992 ; SSE2:       # BB#0:
    993 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
    994 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
    995 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    996 ; SSE2-NEXT:    retq
    997 ;
    998 ; SSSE3-LABEL: combine_nested_undef_test15:
    999 ; SSSE3:       # BB#0:
   1000 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
   1001 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
   1002 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1003 ; SSSE3-NEXT:    retq
   1004 ;
   1005 ; SSE41-LABEL: combine_nested_undef_test15:
   1006 ; SSE41:       # BB#0:
   1007 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
   1008 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
   1009 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
   1010 ; SSE41-NEXT:    retq
   1011 ;
   1012 ; AVX1-LABEL: combine_nested_undef_test15:
   1013 ; AVX1:       # BB#0:
   1014 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
   1015 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
   1016 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
   1017 ; AVX1-NEXT:    retq
   1018 ;
   1019 ; AVX2-LABEL: combine_nested_undef_test15:
   1020 ; AVX2:       # BB#0:
   1021 ; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
   1022 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
   1023 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
   1024 ; AVX2-NEXT:    retq
   1025   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
   1026   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
   1027   ret <4 x i32> %2
   1028 }
   1029 
   1030 define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
   1031 ; SSE2-LABEL: combine_nested_undef_test16:
   1032 ; SSE2:       # BB#0:
   1033 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
   1034 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
   1035 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1036 ; SSE2-NEXT:    retq
   1037 ;
   1038 ; SSSE3-LABEL: combine_nested_undef_test16:
   1039 ; SSSE3:       # BB#0:
   1040 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
   1041 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
   1042 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1043 ; SSSE3-NEXT:    retq
   1044 ;
   1045 ; SSE41-LABEL: combine_nested_undef_test16:
   1046 ; SSE41:       # BB#0:
   1047 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   1048 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
   1049 ; SSE41-NEXT:    retq
   1050 ;
   1051 ; AVX1-LABEL: combine_nested_undef_test16:
   1052 ; AVX1:       # BB#0:
   1053 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   1054 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
   1055 ; AVX1-NEXT:    retq
   1056 ;
   1057 ; AVX2-LABEL: combine_nested_undef_test16:
   1058 ; AVX2:       # BB#0:
   1059 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   1060 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
   1061 ; AVX2-NEXT:    retq
   1062   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1063   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
   1064   ret <4 x i32> %2
   1065 }
   1066 
   1067 define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
   1068 ; SSE2-LABEL: combine_nested_undef_test17:
   1069 ; SSE2:       # BB#0:
   1070 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
   1071 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
   1072 ; SSE2-NEXT:    retq
   1073 ;
   1074 ; SSSE3-LABEL: combine_nested_undef_test17:
   1075 ; SSSE3:       # BB#0:
   1076 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
   1077 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
   1078 ; SSSE3-NEXT:    retq
   1079 ;
   1080 ; SSE41-LABEL: combine_nested_undef_test17:
   1081 ; SSE41:       # BB#0:
   1082 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
   1083 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
   1084 ; SSE41-NEXT:    retq
   1085 ;
   1086 ; AVX1-LABEL: combine_nested_undef_test17:
   1087 ; AVX1:       # BB#0:
   1088 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
   1089 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
   1090 ; AVX1-NEXT:    retq
   1091 ;
   1092 ; AVX2-LABEL: combine_nested_undef_test17:
   1093 ; AVX2:       # BB#0:
   1094 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
   1095 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
   1096 ; AVX2-NEXT:    retq
   1097   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
   1098   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
   1099   ret <4 x i32> %2
   1100 }
   1101 
   1102 define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
   1103 ; SSE-LABEL: combine_nested_undef_test18:
   1104 ; SSE:       # BB#0:
   1105 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
   1106 ; SSE-NEXT:    retq
   1107 ;
   1108 ; AVX-LABEL: combine_nested_undef_test18:
   1109 ; AVX:       # BB#0:
   1110 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
   1111 ; AVX-NEXT:    retq
   1112   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
   1113   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   1114   ret <4 x i32> %2
   1115 }
   1116 
   1117 define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
   1118 ; SSE2-LABEL: combine_nested_undef_test19:
   1119 ; SSE2:       # BB#0:
   1120 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1121 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
   1122 ; SSE2-NEXT:    retq
   1123 ;
   1124 ; SSSE3-LABEL: combine_nested_undef_test19:
   1125 ; SSSE3:       # BB#0:
   1126 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1127 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
   1128 ; SSSE3-NEXT:    retq
   1129 ;
   1130 ; SSE41-LABEL: combine_nested_undef_test19:
   1131 ; SSE41:       # BB#0:
   1132 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
   1133 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
   1134 ; SSE41-NEXT:    retq
   1135 ;
   1136 ; AVX1-LABEL: combine_nested_undef_test19:
   1137 ; AVX1:       # BB#0:
   1138 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
   1139 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
   1140 ; AVX1-NEXT:    retq
   1141 ;
   1142 ; AVX2-LABEL: combine_nested_undef_test19:
   1143 ; AVX2:       # BB#0:
   1144 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
   1145 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
   1146 ; AVX2-NEXT:    retq
   1147   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
   1148   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
   1149   ret <4 x i32> %2
   1150 }
   1151 
   1152 define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
   1153 ; SSE2-LABEL: combine_nested_undef_test20:
   1154 ; SSE2:       # BB#0:
   1155 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
   1156 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
   1157 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1158 ; SSE2-NEXT:    retq
   1159 ;
   1160 ; SSSE3-LABEL: combine_nested_undef_test20:
   1161 ; SSSE3:       # BB#0:
   1162 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
   1163 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
   1164 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1165 ; SSSE3-NEXT:    retq
   1166 ;
   1167 ; SSE41-LABEL: combine_nested_undef_test20:
   1168 ; SSE41:       # BB#0:
   1169 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
   1170 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
   1171 ; SSE41-NEXT:    retq
   1172 ;
   1173 ; AVX1-LABEL: combine_nested_undef_test20:
   1174 ; AVX1:       # BB#0:
   1175 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
   1176 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
   1177 ; AVX1-NEXT:    retq
   1178 ;
   1179 ; AVX2-LABEL: combine_nested_undef_test20:
   1180 ; AVX2:       # BB#0:
   1181 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
   1182 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
   1183 ; AVX2-NEXT:    retq
   1184   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
   1185   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
   1186   ret <4 x i32> %2
   1187 }
   1188 
   1189 define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
   1190 ; SSE2-LABEL: combine_nested_undef_test21:
   1191 ; SSE2:       # BB#0:
   1192 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
   1193 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
   1194 ; SSE2-NEXT:    retq
   1195 ;
   1196 ; SSSE3-LABEL: combine_nested_undef_test21:
   1197 ; SSSE3:       # BB#0:
   1198 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
   1199 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
   1200 ; SSSE3-NEXT:    retq
   1201 ;
   1202 ; SSE41-LABEL: combine_nested_undef_test21:
   1203 ; SSE41:       # BB#0:
   1204 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   1205 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1206 ; SSE41-NEXT:    retq
   1207 ;
   1208 ; AVX1-LABEL: combine_nested_undef_test21:
   1209 ; AVX1:       # BB#0:
   1210 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   1211 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1212 ; AVX1-NEXT:    retq
   1213 ;
   1214 ; AVX2-LABEL: combine_nested_undef_test21:
   1215 ; AVX2:       # BB#0:
   1216 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1217 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
   1218 ; AVX2-NEXT:    retq
   1219   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
   1220   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
   1221   ret <4 x i32> %2
   1222 }
   1223 
   1224 
    1225 ; Test that we correctly combine shuffles according to the rule
   1226 ;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
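; For instance, in combine_nested_undef_test22 below the inner mask <4,5,2,7>
; yields <B0,B1,A2,B3>, and the outer mask <1,1,1,3> only reads positions 1 and 3,
; both taken from %B, so the pair is equivalent to a single shuffle of %B
; (a sketch, not checked by FileCheck; the CHECK lines verify the codegen):
;   %2 == shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>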
   1227 
   1228 define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
   1229 ; SSE-LABEL: combine_nested_undef_test22:
   1230 ; SSE:       # BB#0:
   1231 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
   1232 ; SSE-NEXT:    retq
   1233 ;
   1234 ; AVX-LABEL: combine_nested_undef_test22:
   1235 ; AVX:       # BB#0:
   1236 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
   1237 ; AVX-NEXT:    retq
   1238   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
   1239   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
   1240   ret <4 x i32> %2
   1241 }
   1242 
   1243 define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
   1244 ; SSE-LABEL: combine_nested_undef_test23:
   1245 ; SSE:       # BB#0:
   1246 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
   1247 ; SSE-NEXT:    retq
   1248 ;
   1249 ; AVX-LABEL: combine_nested_undef_test23:
   1250 ; AVX:       # BB#0:
   1251 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
   1252 ; AVX-NEXT:    retq
   1253   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
   1254   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
   1255   ret <4 x i32> %2
   1256 }
   1257 
   1258 define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
   1259 ; SSE-LABEL: combine_nested_undef_test24:
   1260 ; SSE:       # BB#0:
   1261 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
   1262 ; SSE-NEXT:    retq
   1263 ;
   1264 ; AVX-LABEL: combine_nested_undef_test24:
   1265 ; AVX:       # BB#0:
   1266 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
   1267 ; AVX-NEXT:    retq
   1268   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
   1269   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
   1270   ret <4 x i32> %2
   1271 }
   1272 
   1273 define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
   1274 ; SSE-LABEL: combine_nested_undef_test25:
   1275 ; SSE:       # BB#0:
   1276 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1277 ; SSE-NEXT:    retq
   1278 ;
   1279 ; AVX1-LABEL: combine_nested_undef_test25:
   1280 ; AVX1:       # BB#0:
   1281 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1282 ; AVX1-NEXT:    retq
   1283 ;
   1284 ; AVX2-LABEL: combine_nested_undef_test25:
   1285 ; AVX2:       # BB#0:
   1286 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
   1287 ; AVX2-NEXT:    retq
   1288   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
   1289   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
   1290   ret <4 x i32> %2
   1291 }
   1292 
   1293 define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
   1294 ; SSE-LABEL: combine_nested_undef_test26:
   1295 ; SSE:       # BB#0:
   1296 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
   1297 ; SSE-NEXT:    retq
   1298 ;
   1299 ; AVX-LABEL: combine_nested_undef_test26:
   1300 ; AVX:       # BB#0:
   1301 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
   1302 ; AVX-NEXT:    retq
   1303   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
   1304   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
   1305   ret <4 x i32> %2
   1306 }
   1307 
   1308 define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
   1309 ; SSE-LABEL: combine_nested_undef_test27:
   1310 ; SSE:       # BB#0:
   1311 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1312 ; SSE-NEXT:    retq
   1313 ;
   1314 ; AVX1-LABEL: combine_nested_undef_test27:
   1315 ; AVX1:       # BB#0:
   1316 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
   1317 ; AVX1-NEXT:    retq
   1318 ;
   1319 ; AVX2-LABEL: combine_nested_undef_test27:
   1320 ; AVX2:       # BB#0:
   1321 ; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
   1322 ; AVX2-NEXT:    retq
   1323   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
   1324   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
   1325   ret <4 x i32> %2
   1326 }
   1327 
   1328 define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
   1329 ; SSE-LABEL: combine_nested_undef_test28:
   1330 ; SSE:       # BB#0:
   1331 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
   1332 ; SSE-NEXT:    retq
   1333 ;
   1334 ; AVX-LABEL: combine_nested_undef_test28:
   1335 ; AVX:       # BB#0:
   1336 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
   1337 ; AVX-NEXT:    retq
   1338   %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
   1339   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
   1340   ret <4 x i32> %2
   1341 }
   1342 
   1343 define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
   1344 ; SSE-LABEL: combine_test1:
   1345 ; SSE:       # BB#0:
   1346 ; SSE-NEXT:    movaps %xmm1, %xmm0
   1347 ; SSE-NEXT:    retq
   1348 ;
   1349 ; AVX-LABEL: combine_test1:
   1350 ; AVX:       # BB#0:
   1351 ; AVX-NEXT:    vmovaps %xmm1, %xmm0
   1352 ; AVX-NEXT:    retq
   1353   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1354   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1355   ret <4 x float> %2
   1356 }
   1357 
   1358 define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
   1359 ; SSE2-LABEL: combine_test2:
   1360 ; SSE2:       # BB#0:
   1361 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1362 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1363 ; SSE2-NEXT:    retq
   1364 ;
   1365 ; SSSE3-LABEL: combine_test2:
   1366 ; SSSE3:       # BB#0:
   1367 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1368 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1369 ; SSSE3-NEXT:    retq
   1370 ;
   1371 ; SSE41-LABEL: combine_test2:
   1372 ; SSE41:       # BB#0:
   1373 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1374 ; SSE41-NEXT:    retq
   1375 ;
   1376 ; AVX-LABEL: combine_test2:
   1377 ; AVX:       # BB#0:
   1378 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1379 ; AVX-NEXT:    retq
   1380   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1381   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
   1382   ret <4 x float> %2
   1383 }
   1384 
   1385 define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
   1386 ; SSE-LABEL: combine_test3:
   1387 ; SSE:       # BB#0:
   1388 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1389 ; SSE-NEXT:    retq
   1390 ;
   1391 ; AVX-LABEL: combine_test3:
   1392 ; AVX:       # BB#0:
   1393 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1394 ; AVX-NEXT:    retq
   1395   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   1396   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   1397   ret <4 x float> %2
   1398 }
   1399 
   1400 define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
   1401 ; SSE-LABEL: combine_test4:
   1402 ; SSE:       # BB#0:
   1403 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   1404 ; SSE-NEXT:    movapd %xmm1, %xmm0
   1405 ; SSE-NEXT:    retq
   1406 ;
   1407 ; AVX-LABEL: combine_test4:
   1408 ; AVX:       # BB#0:
   1409 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   1410 ; AVX-NEXT:    retq
   1411   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   1412   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   1413   ret <4 x float> %2
   1414 }
   1415 
   1416 define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
   1417 ; SSE2-LABEL: combine_test5:
   1418 ; SSE2:       # BB#0:
   1419 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1420 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1421 ; SSE2-NEXT:    retq
   1422 ;
   1423 ; SSSE3-LABEL: combine_test5:
   1424 ; SSSE3:       # BB#0:
   1425 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1426 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1427 ; SSSE3-NEXT:    retq
   1428 ;
   1429 ; SSE41-LABEL: combine_test5:
   1430 ; SSE41:       # BB#0:
   1431 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1432 ; SSE41-NEXT:    retq
   1433 ;
   1434 ; AVX-LABEL: combine_test5:
   1435 ; AVX:       # BB#0:
   1436 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1437 ; AVX-NEXT:    retq
   1438   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1439   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   1440   ret <4 x float> %2
   1441 }
   1442 
   1443 define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
   1444 ; SSE-LABEL: combine_test6:
   1445 ; SSE:       # BB#0:
   1446 ; SSE-NEXT:    movaps %xmm1, %xmm0
   1447 ; SSE-NEXT:    retq
   1448 ;
   1449 ; AVX-LABEL: combine_test6:
   1450 ; AVX:       # BB#0:
   1451 ; AVX-NEXT:    vmovaps %xmm1, %xmm0
   1452 ; AVX-NEXT:    retq
   1453   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1454   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1455   ret <4 x i32> %2
   1456 }
   1457 
   1458 define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
   1459 ; SSE2-LABEL: combine_test7:
   1460 ; SSE2:       # BB#0:
   1461 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1462 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1463 ; SSE2-NEXT:    retq
   1464 ;
   1465 ; SSSE3-LABEL: combine_test7:
   1466 ; SSSE3:       # BB#0:
   1467 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1468 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1469 ; SSSE3-NEXT:    retq
   1470 ;
   1471 ; SSE41-LABEL: combine_test7:
   1472 ; SSE41:       # BB#0:
   1473 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
   1474 ; SSE41-NEXT:    retq
   1475 ;
   1476 ; AVX1-LABEL: combine_test7:
   1477 ; AVX1:       # BB#0:
   1478 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
   1479 ; AVX1-NEXT:    retq
   1480 ;
   1481 ; AVX2-LABEL: combine_test7:
   1482 ; AVX2:       # BB#0:
   1483 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1484 ; AVX2-NEXT:    retq
   1485   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1486   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
   1487   ret <4 x i32> %2
   1488 }
   1489 
   1490 define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
   1491 ; SSE-LABEL: combine_test8:
   1492 ; SSE:       # BB#0:
   1493 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1494 ; SSE-NEXT:    retq
   1495 ;
   1496 ; AVX-LABEL: combine_test8:
   1497 ; AVX:       # BB#0:
   1498 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1499 ; AVX-NEXT:    retq
   1500   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   1501   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   1502   ret <4 x i32> %2
   1503 }
   1504 
   1505 define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
   1506 ; SSE-LABEL: combine_test9:
   1507 ; SSE:       # BB#0:
   1508 ; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   1509 ; SSE-NEXT:    movdqa %xmm1, %xmm0
   1510 ; SSE-NEXT:    retq
   1511 ;
   1512 ; AVX-LABEL: combine_test9:
   1513 ; AVX:       # BB#0:
   1514 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   1515 ; AVX-NEXT:    retq
   1516   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   1517   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   1518   ret <4 x i32> %2
   1519 }
   1520 
   1521 define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
   1522 ; SSE2-LABEL: combine_test10:
   1523 ; SSE2:       # BB#0:
   1524 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1525 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1526 ; SSE2-NEXT:    retq
   1527 ;
   1528 ; SSSE3-LABEL: combine_test10:
   1529 ; SSSE3:       # BB#0:
   1530 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1531 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1532 ; SSSE3-NEXT:    retq
   1533 ;
   1534 ; SSE41-LABEL: combine_test10:
   1535 ; SSE41:       # BB#0:
   1536 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   1537 ; SSE41-NEXT:    retq
   1538 ;
   1539 ; AVX1-LABEL: combine_test10:
   1540 ; AVX1:       # BB#0:
   1541 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   1542 ; AVX1-NEXT:    retq
   1543 ;
   1544 ; AVX2-LABEL: combine_test10:
   1545 ; AVX2:       # BB#0:
   1546 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1547 ; AVX2-NEXT:    retq
   1548   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1549   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   1550   ret <4 x i32> %2
   1551 }
   1552 
   1553 define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
   1554 ; ALL-LABEL: combine_test11:
   1555 ; ALL:       # BB#0:
   1556 ; ALL-NEXT:    retq
   1557   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1558   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1559   ret <4 x float> %2
   1560 }
   1561 
   1562 define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
   1563 ; SSE2-LABEL: combine_test12:
   1564 ; SSE2:       # BB#0:
   1565 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1566 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1567 ; SSE2-NEXT:    retq
   1568 ;
   1569 ; SSSE3-LABEL: combine_test12:
   1570 ; SSSE3:       # BB#0:
   1571 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1572 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1573 ; SSSE3-NEXT:    retq
   1574 ;
   1575 ; SSE41-LABEL: combine_test12:
   1576 ; SSE41:       # BB#0:
   1577 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1578 ; SSE41-NEXT:    retq
   1579 ;
   1580 ; AVX-LABEL: combine_test12:
   1581 ; AVX:       # BB#0:
   1582 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1583 ; AVX-NEXT:    retq
   1584   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   1585   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   1586   ret <4 x float> %2
   1587 }
   1588 
   1589 define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
   1590 ; SSE-LABEL: combine_test13:
   1591 ; SSE:       # BB#0:
   1592 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1593 ; SSE-NEXT:    retq
   1594 ;
   1595 ; AVX-LABEL: combine_test13:
   1596 ; AVX:       # BB#0:
   1597 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1598 ; AVX-NEXT:    retq
   1599   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   1600   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   1601   ret <4 x float> %2
   1602 }
   1603 
   1604 define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
   1605 ; SSE-LABEL: combine_test14:
   1606 ; SSE:       # BB#0:
   1607 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1608 ; SSE-NEXT:    retq
   1609 ;
   1610 ; AVX-LABEL: combine_test14:
   1611 ; AVX:       # BB#0:
   1612 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1613 ; AVX-NEXT:    retq
   1614   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
   1615   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   1616   ret <4 x float> %2
   1617 }
   1618 
   1619 define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
   1620 ; SSE2-LABEL: combine_test15:
   1621 ; SSE2:       # BB#0:
   1622 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1623 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1624 ; SSE2-NEXT:    retq
   1625 ;
   1626 ; SSSE3-LABEL: combine_test15:
   1627 ; SSSE3:       # BB#0:
   1628 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1629 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1630 ; SSSE3-NEXT:    retq
   1631 ;
   1632 ; SSE41-LABEL: combine_test15:
   1633 ; SSE41:       # BB#0:
   1634 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1635 ; SSE41-NEXT:    retq
   1636 ;
   1637 ; AVX-LABEL: combine_test15:
   1638 ; AVX:       # BB#0:
   1639 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1640 ; AVX-NEXT:    retq
   1641   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
   1642   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
   1643   ret <4 x float> %2
   1644 }
   1645 
   1646 define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
   1647 ; ALL-LABEL: combine_test16:
   1648 ; ALL:       # BB#0:
   1649 ; ALL-NEXT:    retq
   1650   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1651   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1652   ret <4 x i32> %2
   1653 }
   1654 
   1655 define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
   1656 ; SSE2-LABEL: combine_test17:
   1657 ; SSE2:       # BB#0:
   1658 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1659 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1660 ; SSE2-NEXT:    retq
   1661 ;
   1662 ; SSSE3-LABEL: combine_test17:
   1663 ; SSSE3:       # BB#0:
   1664 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   1665 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   1666 ; SSSE3-NEXT:    retq
   1667 ;
   1668 ; SSE41-LABEL: combine_test17:
   1669 ; SSE41:       # BB#0:
   1670 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
   1671 ; SSE41-NEXT:    retq
   1672 ;
   1673 ; AVX1-LABEL: combine_test17:
   1674 ; AVX1:       # BB#0:
   1675 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
   1676 ; AVX1-NEXT:    retq
   1677 ;
   1678 ; AVX2-LABEL: combine_test17:
   1679 ; AVX2:       # BB#0:
   1680 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1681 ; AVX2-NEXT:    retq
   1682   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   1683   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   1684   ret <4 x i32> %2
   1685 }
   1686 
   1687 define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
   1688 ; SSE-LABEL: combine_test18:
   1689 ; SSE:       # BB#0:
   1690 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1691 ; SSE-NEXT:    retq
   1692 ;
   1693 ; AVX-LABEL: combine_test18:
   1694 ; AVX:       # BB#0:
   1695 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1696 ; AVX-NEXT:    retq
   1697   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   1698   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   1699   ret <4 x i32> %2
   1700 }
   1701 
   1702 define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
   1703 ; SSE-LABEL: combine_test19:
   1704 ; SSE:       # BB#0:
   1705 ; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1706 ; SSE-NEXT:    retq
   1707 ;
   1708 ; AVX-LABEL: combine_test19:
   1709 ; AVX:       # BB#0:
   1710 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1711 ; AVX-NEXT:    retq
   1712   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
   1713   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   1714   ret <4 x i32> %2
   1715 }
   1716 
   1717 define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
   1718 ; SSE2-LABEL: combine_test20:
   1719 ; SSE2:       # BB#0:
   1720 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1721 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1722 ; SSE2-NEXT:    retq
   1723 ;
   1724 ; SSSE3-LABEL: combine_test20:
   1725 ; SSSE3:       # BB#0:
   1726 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   1727 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   1728 ; SSSE3-NEXT:    retq
   1729 ;
   1730 ; SSE41-LABEL: combine_test20:
   1731 ; SSE41:       # BB#0:
   1732 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   1733 ; SSE41-NEXT:    retq
   1734 ;
   1735 ; AVX1-LABEL: combine_test20:
   1736 ; AVX1:       # BB#0:
   1737 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   1738 ; AVX1-NEXT:    retq
   1739 ;
   1740 ; AVX2-LABEL: combine_test20:
   1741 ; AVX2:       # BB#0:
   1742 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   1743 ; AVX2-NEXT:    retq
   1744   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
   1745   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
   1746   ret <4 x i32> %2
   1747 }
   1748 
   1749 define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
   1750 ; SSE-LABEL: combine_test21:
   1751 ; SSE:       # BB#0:
   1752 ; SSE-NEXT:    movdqa %xmm0, %xmm2
   1753 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
   1754 ; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1755 ; SSE-NEXT:    movdqa %xmm2, (%rdi)
   1756 ; SSE-NEXT:    retq
   1757 ;
   1758 ; AVX1-LABEL: combine_test21:
   1759 ; AVX1:       # BB#0:
   1760 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1761 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
   1762 ; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1763 ; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
   1764 ; AVX1-NEXT:    vzeroupper
   1765 ; AVX1-NEXT:    retq
   1766 ;
   1767 ; AVX2-LABEL: combine_test21:
   1768 ; AVX2:       # BB#0:
   1769 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1770 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
   1771 ; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1772 ; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
   1773 ; AVX2-NEXT:    vzeroupper
   1774 ; AVX2-NEXT:    retq
   1775   %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   1776   %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1777   store <4 x i32> %1, <4 x i32>* %ptr, align 16
   1778   ret <4 x i32> %2
   1779 }
   1780 
   1781 define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
   1782 ; SSE-LABEL: combine_test22:
   1783 ; SSE:       # BB#0:
   1784 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
   1785 ; SSE-NEXT:    movhpd (%rsi), %xmm0
   1786 ; SSE-NEXT:    retq
   1787 ;
   1788 ; AVX-LABEL: combine_test22:
   1789 ; AVX:       # BB#0:
   1790 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
   1791 ; AVX-NEXT:    vmovhpd (%rsi), %xmm0, %xmm0
   1792 ; AVX-NEXT:    retq
    1793 ; Current AVX2 lowering of this is still awful, so we don't add an AVX2 test case.
   1794   %1 = load <2 x float>, <2 x float>* %a, align 8
   1795   %2 = load <2 x float>, <2 x float>* %b, align 8
   1796   %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   1797   ret <8 x float> %3
   1798 }
   1799 
   1800 ; Check some negative cases.
   1801 ; FIXME: Do any of these really make sense? Are they redundant with the above tests?
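;
; As a worked sketch (explanatory only, not a FileCheck line), combine_test1b
; below composes as:
;   %1 = shufflevector %a, %b, <4, 1, 6, 3>  ; lanes = b[0], a[1], b[2], a[3]
;   %2 = shufflevector %1, %b, <0, 5, 2, 0>  ; lanes = b[0], b[1], b[2], b[0]
; Only %b is referenced, but lane 0 repeats, so the expected output is a single
; shuffle of %b with mask <0, 1, 2, 0> rather than a plain blend or move.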
   1802 
   1803 define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
   1804 ; SSE-LABEL: combine_test1b:
   1805 ; SSE:       # BB#0:
   1806 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
   1807 ; SSE-NEXT:    movaps %xmm1, %xmm0
   1808 ; SSE-NEXT:    retq
   1809 ;
   1810 ; AVX-LABEL: combine_test1b:
   1811 ; AVX:       # BB#0:
   1812 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
   1813 ; AVX-NEXT:    retq
   1814   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1815   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
   1816   ret <4 x float> %2
   1817 }
   1818 
   1819 define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
   1820 ; SSE2-LABEL: combine_test2b:
   1821 ; SSE2:       # BB#0:
   1822 ; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
   1823 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   1824 ; SSE2-NEXT:    retq
   1825 ;
   1826 ; SSSE3-LABEL: combine_test2b:
   1827 ; SSSE3:       # BB#0:
   1828 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
   1829 ; SSSE3-NEXT:    retq
   1830 ;
   1831 ; SSE41-LABEL: combine_test2b:
   1832 ; SSE41:       # BB#0:
   1833 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
   1834 ; SSE41-NEXT:    retq
   1835 ;
   1836 ; AVX-LABEL: combine_test2b:
   1837 ; AVX:       # BB#0:
   1838 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
   1839 ; AVX-NEXT:    retq
   1840   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1841   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
   1842   ret <4 x float> %2
   1843 }
   1844 
   1845 define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
   1846 ; SSE2-LABEL: combine_test3b:
   1847 ; SSE2:       # BB#0:
   1848 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
   1849 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
   1850 ; SSE2-NEXT:    retq
   1851 ;
   1852 ; SSSE3-LABEL: combine_test3b:
   1853 ; SSSE3:       # BB#0:
   1854 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
   1855 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
   1856 ; SSSE3-NEXT:    retq
   1857 ;
   1858 ; SSE41-LABEL: combine_test3b:
   1859 ; SSE41:       # BB#0:
   1860 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
   1861 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
   1862 ; SSE41-NEXT:    retq
   1863 ;
   1864 ; AVX-LABEL: combine_test3b:
   1865 ; AVX:       # BB#0:
   1866 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
   1867 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
   1868 ; AVX-NEXT:    retq
   1869   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
   1870   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
   1871   ret <4 x float> %2
   1872 }
   1873 
   1874 define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
   1875 ; SSE-LABEL: combine_test4b:
   1876 ; SSE:       # BB#0:
   1877 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
   1878 ; SSE-NEXT:    movaps %xmm1, %xmm0
   1879 ; SSE-NEXT:    retq
   1880 ;
   1881 ; AVX-LABEL: combine_test4b:
   1882 ; AVX:       # BB#0:
   1883 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
   1884 ; AVX-NEXT:    retq
   1885   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1886   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
   1887   ret <4 x float> %2
   1888 }
   1889 
   1890 
   1891 ; Verify that we correctly fold shuffles even when we use illegal vector types.
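;
; As an explanatory sketch (not a FileCheck line): <4 x i8> is not a legal x86
; vector type, so each element is effectively widened to 32 bits first
; (punpcklbw/punpcklwd on SSE2, pmovzxbd on SSE4.1 and AVX, as the checks below
; show). The same mask composition still applies after widening; e.g. in
; combine_test1c the masks <0, 5, 2, 7> and <0, 1, 6, 3> compose to
; <0, 5, 6, 7>, i.e. lane 0 from %A and lanes 1-3 from %B, which is the single
; blend/movss each target is expected to select.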
   1892 
   1893 define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
   1894 ; SSE2-LABEL: combine_test1c:
   1895 ; SSE2:       # BB#0:
   1896 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1897 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1898 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   1899 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1900 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1901 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1902 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
   1903 ; SSE2-NEXT:    retq
   1904 ;
   1905 ; SSSE3-LABEL: combine_test1c:
   1906 ; SSSE3:       # BB#0:
   1907 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1908 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1909 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   1910 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1911 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1912 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1913 ; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
   1914 ; SSSE3-NEXT:    retq
   1915 ;
   1916 ; SSE41-LABEL: combine_test1c:
   1917 ; SSE41:       # BB#0:
   1918 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1919 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1920 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
   1921 ; SSE41-NEXT:    retq
   1922 ;
   1923 ; AVX1-LABEL: combine_test1c:
   1924 ; AVX1:       # BB#0:
   1925 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1926 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1927 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
   1928 ; AVX1-NEXT:    retq
   1929 ;
   1930 ; AVX2-LABEL: combine_test1c:
   1931 ; AVX2:       # BB#0:
   1932 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1933 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1934 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   1935 ; AVX2-NEXT:    retq
   1936   %A = load <4 x i8>, <4 x i8>* %a
   1937   %B = load <4 x i8>, <4 x i8>* %b
   1938   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1939   %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
   1940   ret <4 x i8> %2
   1941 }
   1942 
   1943 define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
   1944 ; SSE2-LABEL: combine_test2c:
   1945 ; SSE2:       # BB#0:
   1946 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1947 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1948 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1949 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1950 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1951 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   1952 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1953 ; SSE2-NEXT:    retq
   1954 ;
   1955 ; SSSE3-LABEL: combine_test2c:
   1956 ; SSSE3:       # BB#0:
   1957 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1958 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1959 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1960 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1961 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1962 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   1963 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1964 ; SSSE3-NEXT:    retq
   1965 ;
   1966 ; SSE41-LABEL: combine_test2c:
   1967 ; SSE41:       # BB#0:
   1968 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1969 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1970 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1971 ; SSE41-NEXT:    retq
   1972 ;
   1973 ; AVX-LABEL: combine_test2c:
   1974 ; AVX:       # BB#0:
   1975 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1976 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   1977 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1978 ; AVX-NEXT:    retq
   1979   %A = load <4 x i8>, <4 x i8>* %a
   1980   %B = load <4 x i8>, <4 x i8>* %b
   1981   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
   1982   %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   1983   ret <4 x i8> %2
   1984 }
   1985 
   1986 define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
   1987 ; SSE2-LABEL: combine_test3c:
   1988 ; SSE2:       # BB#0:
   1989 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1990 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1991 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   1992 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1993 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1994 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1995 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   1996 ; SSE2-NEXT:    retq
   1997 ;
   1998 ; SSSE3-LABEL: combine_test3c:
   1999 ; SSSE3:       # BB#0:
   2000 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   2001 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2002 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   2003 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   2004 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2005 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   2006 ; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   2007 ; SSSE3-NEXT:    retq
   2008 ;
   2009 ; SSE41-LABEL: combine_test3c:
   2010 ; SSE41:       # BB#0:
   2011 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2012 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2013 ; SSE41-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
   2014 ; SSE41-NEXT:    retq
   2015 ;
   2016 ; AVX-LABEL: combine_test3c:
   2017 ; AVX:       # BB#0:
   2018 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2019 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2020 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2021 ; AVX-NEXT:    retq
   2022   %A = load <4 x i8>, <4 x i8>* %a
   2023   %B = load <4 x i8>, <4 x i8>* %b
   2024   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2025   %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   2026   ret <4 x i8> %2
   2027 }
   2028 
   2029 define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
   2030 ; SSE2-LABEL: combine_test4c:
   2031 ; SSE2:       # BB#0:
   2032 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   2033 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2034 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   2035 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   2036 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2037 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   2038 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   2039 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   2040 ; SSE2-NEXT:    retq
   2041 ;
   2042 ; SSSE3-LABEL: combine_test4c:
   2043 ; SSSE3:       # BB#0:
   2044 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   2045 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2046 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   2047 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   2048 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2049 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   2050 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
   2051 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
   2052 ; SSSE3-NEXT:    retq
   2053 ;
   2054 ; SSE41-LABEL: combine_test4c:
   2055 ; SSE41:       # BB#0:
   2056 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2057 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2058 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
   2059 ; SSE41-NEXT:    retq
   2060 ;
   2061 ; AVX1-LABEL: combine_test4c:
   2062 ; AVX1:       # BB#0:
   2063 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2064 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2065 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
   2066 ; AVX1-NEXT:    retq
   2067 ;
   2068 ; AVX2-LABEL: combine_test4c:
   2069 ; AVX2:       # BB#0:
   2070 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2071 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2072 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
   2073 ; AVX2-NEXT:    retq
   2074   %A = load <4 x i8>, <4 x i8>* %a
   2075   %B = load <4 x i8>, <4 x i8>* %b
   2076   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   2077   %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   2078   ret <4 x i8> %2
   2079 }
   2080 
   2081 
   2082 ; The following test cases are generated from this C++ code
   2083 ;
   2084 ;__m128 blend_01(__m128 a, __m128 b)
   2085 ;{
   2086 ;  __m128 s = a;
   2087 ;  s = _mm_blend_ps( s, b, 1<<0 );
   2088 ;  s = _mm_blend_ps( s, b, 1<<1 );
   2089 ;  return s;
   2090 ;}
   2091 ;
   2092 ;__m128 blend_02(__m128 a, __m128 b)
   2093 ;{
   2094 ;  __m128 s = a;
   2095 ;  s = _mm_blend_ps( s, b, 1<<0 );
   2096 ;  s = _mm_blend_ps( s, b, 1<<2 );
   2097 ;  return s;
   2098 ;}
   2099 ;
   2100 ;__m128 blend_123(__m128 a, __m128 b)
   2101 ;{
   2102 ;  __m128 s = a;
   2103 ;  s = _mm_blend_ps( s, b, 1<<1 );
   2104 ;  s = _mm_blend_ps( s, b, 1<<2 );
   2105 ;  s = _mm_blend_ps( s, b, 1<<3 );
   2106 ;  return s;
   2107 ;}
   2108 
   2109 ; Ideally, we should collapse the following shuffles into a single one.
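;
; As an illustrative sketch (not a FileCheck line), blend_02 above applies
; _mm_blend_ps with immediates 1<<0 and then 1<<2; the two selects compose to
; the single immediate (1<<0)|(1<<2) = 0x5, i.e. lanes b[0], a[1], b[2], a[3].
; That is why the SSE41/AVX checks for combine_blend_02 expect exactly one
; blendps.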
   2110 
   2111 define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
   2112 ; SSE2-LABEL: combine_blend_01:
   2113 ; SSE2:       # BB#0:
   2114 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2115 ; SSE2-NEXT:    retq
   2116 ;
   2117 ; SSSE3-LABEL: combine_blend_01:
   2118 ; SSSE3:       # BB#0:
   2119 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2120 ; SSSE3-NEXT:    retq
   2121 ;
   2122 ; SSE41-LABEL: combine_blend_01:
   2123 ; SSE41:       # BB#0:
   2124 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2125 ; SSE41-NEXT:    retq
   2126 ;
   2127 ; AVX-LABEL: combine_blend_01:
   2128 ; AVX:       # BB#0:
   2129 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2130 ; AVX-NEXT:    retq
   2131   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
   2132   %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
   2133   ret <4 x float> %shuffle6
   2134 }
   2135 
   2136 define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
   2137 ; SSE2-LABEL: combine_blend_02:
   2138 ; SSE2:       # BB#0:
   2139 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
   2140 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
   2141 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   2142 ; SSE2-NEXT:    retq
   2143 ;
   2144 ; SSSE3-LABEL: combine_blend_02:
   2145 ; SSSE3:       # BB#0:
   2146 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
   2147 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
   2148 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   2149 ; SSSE3-NEXT:    retq
   2150 ;
   2151 ; SSE41-LABEL: combine_blend_02:
   2152 ; SSE41:       # BB#0:
   2153 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
   2154 ; SSE41-NEXT:    retq
   2155 ;
   2156 ; AVX-LABEL: combine_blend_02:
   2157 ; AVX:       # BB#0:
   2158 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
   2159 ; AVX-NEXT:    retq
   2160   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
   2161   %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
   2162   ret <4 x float> %shuffle6
   2163 }
   2164 
   2165 define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
   2166 ; SSE2-LABEL: combine_blend_123:
   2167 ; SSE2:       # BB#0:
   2168 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   2169 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   2170 ; SSE2-NEXT:    retq
   2171 ;
   2172 ; SSSE3-LABEL: combine_blend_123:
   2173 ; SSSE3:       # BB#0:
   2174 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
   2175 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   2176 ; SSSE3-NEXT:    retq
   2177 ;
   2178 ; SSE41-LABEL: combine_blend_123:
   2179 ; SSE41:       # BB#0:
   2180 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   2181 ; SSE41-NEXT:    retq
   2182 ;
   2183 ; AVX-LABEL: combine_blend_123:
   2184 ; AVX:       # BB#0:
   2185 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
   2186 ; AVX-NEXT:    retq
   2187   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
   2188   %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
   2189   %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   2190   ret <4 x float> %shuffle12
   2191 }
   2192 
   2193 define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
   2194 ; SSE-LABEL: combine_test_movhl_1:
   2195 ; SSE:       # BB#0:
   2196 ; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   2197 ; SSE-NEXT:    movdqa %xmm1, %xmm0
   2198 ; SSE-NEXT:    retq
   2199 ;
   2200 ; AVX-LABEL: combine_test_movhl_1:
   2201 ; AVX:       # BB#0:
   2202 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2203 ; AVX-NEXT:    retq
   2204   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
   2205   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
   2206   ret <4 x i32> %2
   2207 }
   2208 
   2209 define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
   2210 ; SSE-LABEL: combine_test_movhl_2:
   2211 ; SSE:       # BB#0:
   2212 ; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   2213 ; SSE-NEXT:    movdqa %xmm1, %xmm0
   2214 ; SSE-NEXT:    retq
   2215 ;
   2216 ; AVX-LABEL: combine_test_movhl_2:
   2217 ; AVX:       # BB#0:
   2218 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2219 ; AVX-NEXT:    retq
   2220   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
   2221   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
   2222   ret <4 x i32> %2
   2223 }
   2224 
   2225 define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
   2226 ; SSE-LABEL: combine_test_movhl_3:
   2227 ; SSE:       # BB#0:
   2228 ; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   2229 ; SSE-NEXT:    movdqa %xmm1, %xmm0
   2230 ; SSE-NEXT:    retq
   2231 ;
   2232 ; AVX-LABEL: combine_test_movhl_3:
   2233 ; AVX:       # BB#0:
   2234 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2235 ; AVX-NEXT:    retq
   2236   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
   2237   %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
   2238   ret <4 x i32> %2
   2239 }
   2240 
   2241 
   2242 ; Verify that we fold shuffles according to rule:
   2243 ;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
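;
; As an illustrative sketch (not a FileCheck line), in combine_undef_input_test1
; below M0 = <4, 2, 3, 1> gives %1 = <undef, a[2], a[3], a[1]>, and applying
; M1 = <4, 5, 1, 2> yields <b[0], b[1], a[2], a[3]>. The pair therefore folds to
; the single shuffle (shuffle %a, %b, <4, 5, 2, 3>), i.e. one movsd/blendpd.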
   2244 
   2245 define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
   2246 ; SSE2-LABEL: combine_undef_input_test1:
   2247 ; SSE2:       # BB#0:
   2248 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2249 ; SSE2-NEXT:    retq
   2250 ;
   2251 ; SSSE3-LABEL: combine_undef_input_test1:
   2252 ; SSSE3:       # BB#0:
   2253 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2254 ; SSSE3-NEXT:    retq
   2255 ;
   2256 ; SSE41-LABEL: combine_undef_input_test1:
   2257 ; SSE41:       # BB#0:
   2258 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2259 ; SSE41-NEXT:    retq
   2260 ;
   2261 ; AVX-LABEL: combine_undef_input_test1:
   2262 ; AVX:       # BB#0:
   2263 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2264 ; AVX-NEXT:    retq
   2265   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
   2266   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
   2267   ret <4 x float> %2
   2268 }
   2269 
   2270 define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
   2271 ; SSE-LABEL: combine_undef_input_test2:
   2272 ; SSE:       # BB#0:
   2273 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2274 ; SSE-NEXT:    retq
   2275 ;
   2276 ; AVX-LABEL: combine_undef_input_test2:
   2277 ; AVX:       # BB#0:
   2278 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2279 ; AVX-NEXT:    retq
   2280   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
   2281   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
   2282   ret <4 x float> %2
   2283 }
   2284 
   2285 define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
   2286 ; SSE-LABEL: combine_undef_input_test3:
   2287 ; SSE:       # BB#0:
   2288 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2289 ; SSE-NEXT:    retq
   2290 ;
   2291 ; AVX-LABEL: combine_undef_input_test3:
   2292 ; AVX:       # BB#0:
   2293 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2294 ; AVX-NEXT:    retq
   2295   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   2296   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   2297   ret <4 x float> %2
   2298 }
   2299 
   2300 define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
   2301 ; SSE-LABEL: combine_undef_input_test4:
   2302 ; SSE:       # BB#0:
   2303 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   2304 ; SSE-NEXT:    movapd %xmm1, %xmm0
   2305 ; SSE-NEXT:    retq
   2306 ;
   2307 ; AVX-LABEL: combine_undef_input_test4:
   2308 ; AVX:       # BB#0:
   2309 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2310 ; AVX-NEXT:    retq
   2311   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2312   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   2313   ret <4 x float> %2
   2314 }
   2315 
   2316 define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
   2317 ; SSE2-LABEL: combine_undef_input_test5:
   2318 ; SSE2:       # BB#0:
   2319 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
   2320 ; SSE2-NEXT:    movapd %xmm1, %xmm0
   2321 ; SSE2-NEXT:    retq
   2322 ;
   2323 ; SSSE3-LABEL: combine_undef_input_test5:
   2324 ; SSSE3:       # BB#0:
   2325 ; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
   2326 ; SSSE3-NEXT:    movapd %xmm1, %xmm0
   2327 ; SSSE3-NEXT:    retq
   2328 ;
   2329 ; SSE41-LABEL: combine_undef_input_test5:
   2330 ; SSE41:       # BB#0:
   2331 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
   2332 ; SSE41-NEXT:    retq
   2333 ;
   2334 ; AVX-LABEL: combine_undef_input_test5:
   2335 ; AVX:       # BB#0:
   2336 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
   2337 ; AVX-NEXT:    retq
   2338   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   2339   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
   2340   ret <4 x float> %2
   2341 }
   2342 
   2343 
   2344 ; Verify that we fold shuffles according to rule:
   2345 ;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
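;
; As an illustrative sketch (not a FileCheck line), in combine_undef_input_test6
; below M0 = <4, 2, 3, 1> gives %1 = <undef, a[2], a[3], a[1]>, and applying
; M1 = <4, 5, 1, 2> reads <a[0], a[1], a[2], a[3]>, so M2 is the identity mask
; and the whole pair folds away to just retq.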
   2346 
   2347 define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
   2348 ; ALL-LABEL: combine_undef_input_test6:
   2349 ; ALL:       # BB#0:
   2350 ; ALL-NEXT:    retq
   2351   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
   2352   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
   2353   ret <4 x float> %2
   2354 }
   2355 
   2356 define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
   2357 ; SSE2-LABEL: combine_undef_input_test7:
   2358 ; SSE2:       # BB#0:
   2359 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
   2360 ; SSE2-NEXT:    retq
   2361 ;
   2362 ; SSSE3-LABEL: combine_undef_input_test7:
   2363 ; SSSE3:       # BB#0:
   2364 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2365 ; SSSE3-NEXT:    retq
   2366 ;
   2367 ; SSE41-LABEL: combine_undef_input_test7:
   2368 ; SSE41:       # BB#0:
   2369 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2370 ; SSE41-NEXT:    retq
   2371 ;
   2372 ; AVX-LABEL: combine_undef_input_test7:
   2373 ; AVX:       # BB#0:
   2374 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
   2375 ; AVX-NEXT:    retq
   2376   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
   2377   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
   2378   ret <4 x float> %2
   2379 }
   2380 
   2381 define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
   2382 ; SSE2-LABEL: combine_undef_input_test8:
   2383 ; SSE2:       # BB#0:
   2384 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
   2385 ; SSE2-NEXT:    retq
   2386 ;
   2387 ; SSSE3-LABEL: combine_undef_input_test8:
   2388 ; SSSE3:       # BB#0:
   2389 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2390 ; SSSE3-NEXT:    retq
   2391 ;
   2392 ; SSE41-LABEL: combine_undef_input_test8:
   2393 ; SSE41:       # BB#0:
   2394 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2395 ; SSE41-NEXT:    retq
   2396 ;
   2397 ; AVX-LABEL: combine_undef_input_test8:
   2398 ; AVX:       # BB#0:
   2399 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
   2400 ; AVX-NEXT:    retq
   2401   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   2402   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
   2403   ret <4 x float> %2
   2404 }
   2405 
   2406 define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
   2407 ; SSE-LABEL: combine_undef_input_test9:
   2408 ; SSE:       # BB#0:
   2409 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   2410 ; SSE-NEXT:    retq
   2411 ;
   2412 ; AVX-LABEL: combine_undef_input_test9:
   2413 ; AVX:       # BB#0:
   2414 ; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
   2415 ; AVX-NEXT:    retq
   2416   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2417   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   2418   ret <4 x float> %2
   2419 }
   2420 
   2421 define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
   2422 ; ALL-LABEL: combine_undef_input_test10:
   2423 ; ALL:       # BB#0:
   2424 ; ALL-NEXT:    retq
   2425   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   2426   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
   2427   ret <4 x float> %2
   2428 }
   2429 
   2430 define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
   2431 ; SSE2-LABEL: combine_undef_input_test11:
   2432 ; SSE2:       # BB#0:
   2433 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2434 ; SSE2-NEXT:    retq
   2435 ;
   2436 ; SSSE3-LABEL: combine_undef_input_test11:
   2437 ; SSSE3:       # BB#0:
   2438 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2439 ; SSSE3-NEXT:    retq
   2440 ;
   2441 ; SSE41-LABEL: combine_undef_input_test11:
   2442 ; SSE41:       # BB#0:
   2443 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2444 ; SSE41-NEXT:    retq
   2445 ;
   2446 ; AVX-LABEL: combine_undef_input_test11:
   2447 ; AVX:       # BB#0:
   2448 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   2449 ; AVX-NEXT:    retq
   2450   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
   2451   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
   2452   ret <4 x float> %2
   2453 }
   2454 
   2455 define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
   2456 ; SSE-LABEL: combine_undef_input_test12:
   2457 ; SSE:       # BB#0:
   2458 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2459 ; SSE-NEXT:    retq
   2460 ;
   2461 ; AVX-LABEL: combine_undef_input_test12:
   2462 ; AVX:       # BB#0:
   2463 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2464 ; AVX-NEXT:    retq
   2465   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
   2466   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
   2467   ret <4 x float> %2
   2468 }
   2469 
   2470 define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
   2471 ; SSE-LABEL: combine_undef_input_test13:
   2472 ; SSE:       # BB#0:
   2473 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2474 ; SSE-NEXT:    retq
   2475 ;
   2476 ; AVX-LABEL: combine_undef_input_test13:
   2477 ; AVX:       # BB#0:
   2478 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2479 ; AVX-NEXT:    retq
   2480   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   2481   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
   2482   ret <4 x float> %2
   2483 }
   2484 
   2485 define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
   2486 ; SSE-LABEL: combine_undef_input_test14:
   2487 ; SSE:       # BB#0:
   2488 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
   2489 ; SSE-NEXT:    movapd %xmm1, %xmm0
   2490 ; SSE-NEXT:    retq
   2491 ;
   2492 ; AVX-LABEL: combine_undef_input_test14:
   2493 ; AVX:       # BB#0:
   2494 ; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
   2495 ; AVX-NEXT:    retq
   2496   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2497   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   2498   ret <4 x float> %2
   2499 }
   2500 
   2501 define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
   2502 ; SSE2-LABEL: combine_undef_input_test15:
   2503 ; SSE2:       # BB#0:
   2504 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
   2505 ; SSE2-NEXT:    movapd %xmm1, %xmm0
   2506 ; SSE2-NEXT:    retq
   2507 ;
   2508 ; SSSE3-LABEL: combine_undef_input_test15:
   2509 ; SSSE3:       # BB#0:
   2510 ; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
   2511 ; SSSE3-NEXT:    movapd %xmm1, %xmm0
   2512 ; SSSE3-NEXT:    retq
   2513 ;
   2514 ; SSE41-LABEL: combine_undef_input_test15:
   2515 ; SSE41:       # BB#0:
   2516 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
   2517 ; SSE41-NEXT:    retq
   2518 ;
   2519 ; AVX-LABEL: combine_undef_input_test15:
   2520 ; AVX:       # BB#0:
   2521 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
   2522 ; AVX-NEXT:    retq
   2523   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   2524   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
   2525   ret <4 x float> %2
   2526 }
   2527 
   2528 
   2529 ; Verify that shuffles are canonicalized according to the rule:
   2530 ;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
   2531 ;
   2532 ; This allows the following combine rule to trigger:
   2533 ;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
   2534 ;
   2535 ; As a result, each shuffle pair below should be combined into a single legal
   2536 ; shuffle operation (or removed entirely when the combined mask is the identity).
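        ;
        ; As an illustrative walk-through (not an additional test), consider
        ; combine_undef_input_test16 below: the inner shuffle uses M0 = <4,2,3,1>, so
        ;   %1 = <undef, a[2], a[3], a[1]>
        ; and the outer shuffle uses M1 = <0,1,5,3>, picking
        ;   <a[0], a[1], %1[1], a[3]> = <a[0], a[1], a[2], a[3]>.
        ; The combined mask M2 is therefore the identity, and the checked output is a
        ; bare 'retq' with no shuffle instruction at all.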
   2537 
   2538 define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
   2539 ; ALL-LABEL: combine_undef_input_test16:
   2540 ; ALL:       # BB#0:
   2541 ; ALL-NEXT:    retq
   2542   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
   2543   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
   2544   ret <4 x float> %2
   2545 }
   2546 
   2547 define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
   2548 ; SSE2-LABEL: combine_undef_input_test17:
   2549 ; SSE2:       # BB#0:
   2550 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
   2551 ; SSE2-NEXT:    retq
   2552 ;
   2553 ; SSSE3-LABEL: combine_undef_input_test17:
   2554 ; SSSE3:       # BB#0:
   2555 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2556 ; SSSE3-NEXT:    retq
   2557 ;
   2558 ; SSE41-LABEL: combine_undef_input_test17:
   2559 ; SSE41:       # BB#0:
   2560 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2561 ; SSE41-NEXT:    retq
   2562 ;
   2563 ; AVX-LABEL: combine_undef_input_test17:
   2564 ; AVX:       # BB#0:
   2565 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
   2566 ; AVX-NEXT:    retq
   2567   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
   2568   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
   2569   ret <4 x float> %2
   2570 }
   2571 
   2572 define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
   2573 ; SSE2-LABEL: combine_undef_input_test18:
   2574 ; SSE2:       # BB#0:
   2575 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
   2576 ; SSE2-NEXT:    retq
   2577 ;
   2578 ; SSSE3-LABEL: combine_undef_input_test18:
   2579 ; SSSE3:       # BB#0:
   2580 ; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2581 ; SSSE3-NEXT:    retq
   2582 ;
   2583 ; SSE41-LABEL: combine_undef_input_test18:
   2584 ; SSE41:       # BB#0:
   2585 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
   2586 ; SSE41-NEXT:    retq
   2587 ;
   2588 ; AVX-LABEL: combine_undef_input_test18:
   2589 ; AVX:       # BB#0:
   2590 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
   2591 ; AVX-NEXT:    retq
   2592   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   2593   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
   2594   ret <4 x float> %2
   2595 }
   2596 
   2597 define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
   2598 ; SSE-LABEL: combine_undef_input_test19:
   2599 ; SSE:       # BB#0:
   2600 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   2601 ; SSE-NEXT:    retq
   2602 ;
   2603 ; AVX-LABEL: combine_undef_input_test19:
   2604 ; AVX:       # BB#0:
   2605 ; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
   2606 ; AVX-NEXT:    retq
   2607   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
   2608   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   2609   ret <4 x float> %2
   2610 }
   2611 
   2612 define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
   2613 ; ALL-LABEL: combine_undef_input_test20:
   2614 ; ALL:       # BB#0:
   2615 ; ALL-NEXT:    retq
   2616   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   2617   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
   2618   ret <4 x float> %2
   2619 }
   2620 
   2621 ; These tests verify that unnecessary operations feeding into a shuffle can be
   2622 ; combined away. The AVX cases are the important ones, as they rely on
   2623 ; operations that cannot be performed natively on the entire vector and are
   2624 ; therefore decomposed into multiple narrower operations.
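        ;
        ; For instance, in combine_unneeded_subvector1 below the shuffle mask
        ; <7,6,5,4,7,6,5,4> only reads the high 128-bit half of the add result, so the
        ; add of the low half is dead. On AVX1, where the 256-bit integer add is split
        ; into two 128-bit adds, only a single 128-bit vpaddd should remain, as the
        ; checks verify.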
   2625 
   2626 define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
   2627 ; SSE-LABEL: combine_unneeded_subvector1:
   2628 ; SSE:       # BB#0:
   2629 ; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
   2630 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
   2631 ; SSE-NEXT:    movdqa %xmm0, %xmm1
   2632 ; SSE-NEXT:    retq
   2633 ;
   2634 ; AVX1-LABEL: combine_unneeded_subvector1:
   2635 ; AVX1:       # BB#0:
   2636 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2637 ; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
   2638 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
   2639 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
   2640 ; AVX1-NEXT:    retq
   2641 ;
   2642 ; AVX2-LABEL: combine_unneeded_subvector1:
   2643 ; AVX2:       # BB#0:
   2644 ; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
   2645 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
   2646 ; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
   2647 ; AVX2-NEXT:    retq
   2648   %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   2649   %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
   2650   ret <8 x i32> %c
   2651 }
   2652 
   2653 define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
   2654 ; SSE-LABEL: combine_unneeded_subvector2:
   2655 ; SSE:       # BB#0:
   2656 ; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
   2657 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
   2658 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
   2659 ; SSE-NEXT:    retq
   2660 ;
   2661 ; AVX1-LABEL: combine_unneeded_subvector2:
   2662 ; AVX1:       # BB#0:
   2663 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2664 ; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
   2665 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
   2666 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
   2667 ; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
   2668 ; AVX1-NEXT:    retq
   2669 ;
   2670 ; AVX2-LABEL: combine_unneeded_subvector2:
   2671 ; AVX2:       # BB#0:
   2672 ; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
   2673 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
   2674 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
   2675 ; AVX2-NEXT:    retq
   2676   %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   2677   %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
   2678   ret <8 x i32> %d
   2679 }
   2680 
   2681 define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
   2682 ; SSE2-LABEL: combine_insertps1:
   2683 ; SSE2:       # BB#0:
   2684 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
   2685 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
   2686 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   2687 ; SSE2-NEXT:    retq
   2688 ;
   2689 ; SSSE3-LABEL: combine_insertps1:
   2690 ; SSSE3:       # BB#0:
   2691 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
   2692 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
   2693 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   2694 ; SSSE3-NEXT:    retq
   2695 ;
   2696 ; SSE41-LABEL: combine_insertps1:
   2697 ; SSE41:       # BB#0:
   2698 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
   2699 ; SSE41-NEXT:    retq
   2700 ;
   2701 ; AVX-LABEL: combine_insertps1:
   2702 ; AVX:       # BB#0:
   2703 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
   2704 ; AVX-NEXT:    retq
   2705 
   2706   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
   2707   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
   2708   ret <4 x float> %d
   2709 }
   2710 
   2711 define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
   2712 ; SSE2-LABEL: combine_insertps2:
   2713 ; SSE2:       # BB#0:
   2714 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
   2715 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
   2716 ; SSE2-NEXT:    movaps %xmm1, %xmm0
   2717 ; SSE2-NEXT:    retq
   2718 ;
   2719 ; SSSE3-LABEL: combine_insertps2:
   2720 ; SSSE3:       # BB#0:
   2721 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
   2722 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
   2723 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
   2724 ; SSSE3-NEXT:    retq
   2725 ;
   2726 ; SSE41-LABEL: combine_insertps2:
   2727 ; SSE41:       # BB#0:
   2728 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
   2729 ; SSE41-NEXT:    retq
   2730 ;
   2731 ; AVX-LABEL: combine_insertps2:
   2732 ; AVX:       # BB#0:
   2733 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
   2734 ; AVX-NEXT:    retq
   2735 
   2736   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
   2737   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
   2738   ret <4 x float> %d
   2739 }
   2740 
   2741 define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
   2742 ; SSE2-LABEL: combine_insertps3:
   2743 ; SSE2:       # BB#0:
   2744 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
   2745 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
   2746 ; SSE2-NEXT:    retq
   2747 ;
   2748 ; SSSE3-LABEL: combine_insertps3:
   2749 ; SSSE3:       # BB#0:
   2750 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
   2751 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
   2752 ; SSSE3-NEXT:    retq
   2753 ;
   2754 ; SSE41-LABEL: combine_insertps3:
   2755 ; SSE41:       # BB#0:
   2756 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
   2757 ; SSE41-NEXT:    retq
   2758 ;
   2759 ; AVX-LABEL: combine_insertps3:
   2760 ; AVX:       # BB#0:
   2761 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
   2762 ; AVX-NEXT:    retq
   2763 
   2764   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
   2765   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
   2766   ret <4 x float> %d
   2767 }
   2768 
   2769 define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
   2770 ; SSE2-LABEL: combine_insertps4:
   2771 ; SSE2:       # BB#0:
   2772 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
   2773 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
   2774 ; SSE2-NEXT:    retq
   2775 ;
   2776 ; SSSE3-LABEL: combine_insertps4:
   2777 ; SSSE3:       # BB#0:
   2778 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
   2779 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
   2780 ; SSSE3-NEXT:    retq
   2781 ;
   2782 ; SSE41-LABEL: combine_insertps4:
   2783 ; SSE41:       # BB#0:
   2784 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
   2785 ; SSE41-NEXT:    retq
   2786 ;
   2787 ; AVX-LABEL: combine_insertps4:
   2788 ; AVX:       # BB#0:
   2789 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
   2790 ; AVX-NEXT:    retq
   2791 
   2792   %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
   2793   %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
   2794   ret <4 x float> %d
   2795 }
   2796 
   2797 define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
   2798 ; SSE-LABEL: PR22377:
   2799 ; SSE:       # BB#0: # %entry
   2800 ; SSE-NEXT:    movaps %xmm0, %xmm1
   2801 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
   2802 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
   2803 ; SSE-NEXT:    addps %xmm0, %xmm1
   2804 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   2805 ; SSE-NEXT:    retq
   2806 ;
   2807 ; AVX-LABEL: PR22377:
   2808 ; AVX:       # BB#0: # %entry
   2809 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
   2810 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
   2811 ; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
   2812 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   2813 ; AVX-NEXT:    retq
   2814 entry:
   2815   %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
   2816   %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
   2817   %r2 = fadd <4 x float> %s1, %s2
   2818   %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   2819   ret <4 x float> %s3
   2820 }
   2821 
   2822 define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
   2823 ; SSE2-LABEL: PR22390:
   2824 ; SSE2:       # BB#0: # %entry
   2825 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
   2826 ; SSE2-NEXT:    movaps %xmm0, %xmm2
   2827 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
   2828 ; SSE2-NEXT:    addps %xmm0, %xmm2
   2829 ; SSE2-NEXT:    movaps %xmm2, %xmm0
   2830 ; SSE2-NEXT:    retq
   2831 ;
   2832 ; SSSE3-LABEL: PR22390:
   2833 ; SSSE3:       # BB#0: # %entry
   2834 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
   2835 ; SSSE3-NEXT:    movaps %xmm0, %xmm2
   2836 ; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
   2837 ; SSSE3-NEXT:    addps %xmm0, %xmm2
   2838 ; SSSE3-NEXT:    movaps %xmm2, %xmm0
   2839 ; SSSE3-NEXT:    retq
   2840 ;
   2841 ; SSE41-LABEL: PR22390:
   2842 ; SSE41:       # BB#0: # %entry
   2843 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
   2844 ; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
   2845 ; SSE41-NEXT:    addps %xmm1, %xmm0
   2846 ; SSE41-NEXT:    retq
   2847 ;
   2848 ; AVX-LABEL: PR22390:
   2849 ; AVX:       # BB#0: # %entry
   2850 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
   2851 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
   2852 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   2853 ; AVX-NEXT:    retq
   2854 entry:
   2855   %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
   2856   %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   2857   %r2 = fadd <4 x float> %s1, %s2
   2858   ret <4 x float> %r2
   2859 }
   2860 
   2861 define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
   2862 ; SSE2-LABEL: PR22412:
   2863 ; SSE2:       # BB#0: # %entry
   2864 ; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
   2865 ; SSE2-NEXT:    movapd %xmm2, %xmm0
   2866 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
   2867 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
   2868 ; SSE2-NEXT:    movaps %xmm3, %xmm1
   2869 ; SSE2-NEXT:    retq
   2870 ;
   2871 ; SSSE3-LABEL: PR22412:
   2872 ; SSSE3:       # BB#0: # %entry
   2873 ; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
   2874 ; SSSE3-NEXT:    movapd %xmm2, %xmm0
   2875 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
   2876 ; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
   2877 ; SSSE3-NEXT:    movaps %xmm3, %xmm1
   2878 ; SSSE3-NEXT:    retq
   2879 ;
   2880 ; SSE41-LABEL: PR22412:
   2881 ; SSE41:       # BB#0: # %entry
   2882 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
   2883 ; SSE41-NEXT:    movapd %xmm0, %xmm1
   2884 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2]
   2885 ; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2]
   2886 ; SSE41-NEXT:    movaps %xmm1, %xmm0
   2887 ; SSE41-NEXT:    movaps %xmm3, %xmm1
   2888 ; SSE41-NEXT:    retq
   2889 ;
   2890 ; AVX1-LABEL: PR22412:
   2891 ; AVX1:       # BB#0: # %entry
   2892 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
   2893 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
   2894 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
   2895 ; AVX1-NEXT:    retq
   2896 ;
   2897 ; AVX2-LABEL: PR22412:
   2898 ; AVX2:       # BB#0: # %entry
   2899 ; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
   2900 ; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
   2901 ; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
   2902 ; AVX2-NEXT:    retq
   2903 entry:
   2904   %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   2905   %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
   2906   ret <8 x float> %s2
   2907 }
   2908