; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512

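; These tests exercise X86 target shuffle combining: AVX2 variable shuffle
; intrinsics (pshufb, vpermd, vpermps) composed with generic shufflevectors
; should fold into simpler or cheaper instructions.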
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)

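; A pshufb control byte with its top bit set (e.g. 128) zeroes that lane. In
; the two tests below the following zero-filling shufflevector only keeps
; bytes the pshufb already zeroed, so the whole sequence folds to an all-zero
; vector.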
define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_pslldq:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_pslldq:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_psrldq:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_psrldq:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
  ret <32 x i8> %2
}

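; A cross-lane vpermd/vpermps followed by an in-lane byte shuffle merges into
; a single vpshufb when every byte of the combined mask stays inside its own
; 128-bit lane.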
define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) {
; X32-LABEL: combine_pshufb_vpermd:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_vpermd:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X64-NEXT:    retq
  %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
; X32-LABEL: combine_pshufb_vpermps:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_vpermps:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X64-NEXT:    retq
  %tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x float> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

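; Shuffles that only pass through or zero whole 16-bit elements are cheaper as
; a blend against a zero register than as a variable pshufb, whichever side of
; the pshufb the masking shuffle sits on.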
define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
; X32-LABEL: combine_and_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X32-NEXT:    retl
;
; X64-LABEL: combine_and_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_and:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_and:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X64-NEXT:    retq
  %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %2
}

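; Reversing the quadwords and then zeroing the upper half reduces to the
; original upper 128-bit lane followed by zeros, which vperm2i128 produces in
; one instruction.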
define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
; X32-LABEL: combine_permq_pshufb_as_vperm2i128:
; X32:       # %bb.0:
; X32-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X32-NEXT:    vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permq_pshufb_as_vperm2i128:
; X64:       # %bb.0:
; X64-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X64-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
  ret <4 x i64> %5
}

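; A chain of shuffles feeding a final shufflevector merges into one cross-lane
; vpermps with a combined (possibly partly undef) index vector.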
define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) {
; X32-LABEL: combine_as_vpermd:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
; X32-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_as_vpermd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
; X64-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6>)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 8, i32 9, i32 1, i32 15, i32 14, i32 4, i32 3>
  ret <8 x i32> %3
}

define <8 x float> @combine_as_vpermps(<8 x float> %a0) {
; X32-LABEL: combine_as_vpermps:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7>
; X32-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_as_vpermps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7>
; X64-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 1, i32 undef, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 15, i32 0, i32 14, i32 1, i32 8, i32 9, i32 4, i32 3>
  ret <8 x float> %3
}

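; Here the pshufb zeroes exactly the upper 128 bits, so the permq+pshufb pair
; becomes a blend of the low half of the input with a zero vector.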
define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
; X32-LABEL: combine_permq_pshufb_as_vpblendd:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: combine_permq_pshufb_as_vpblendd:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X64-NEXT:    retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  ret <32 x i8> %3
}

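; A pshufb whose control repeats element 0 (an all-zeros mask, or a repeating
; 0,1 / 0..3 / 0..7 byte pattern) is a splat and lowers to the matching
; vpbroadcastb/w/d/q or vbroadcastss/sd.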
define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastb128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastb %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastb128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastb %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
  ret <16 x i8> %1
}

define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastb256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vpbroadcastb %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastb256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastw128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastw128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  ret <16 x i8> %1
}

define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastw256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vpbroadcastw %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastw256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vpbroadcastw %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastd128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
; X32-NEXT:    vpaddb {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastd128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
; X64-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>
  ret <16 x i8> %2
}

define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
; X32-LABEL: combine_permd_as_vpbroadcastd256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vpbroadcastd %xmm0, %ymm0
; X32-NEXT:    vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permd_as_vpbroadcastd256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0
; X64-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastq128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastq128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %1
}

define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
; X32-LABEL: combine_permd_as_vpbroadcastq256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vpbroadcastq %xmm0, %ymm0
; X32-NEXT:    vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permd_as_vpbroadcastq256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastss128:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastss128:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = bitcast <4 x float> %a to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  ret <4 x float> %3
}

define <8 x float> @combine_permps_as_vpbroadcastss256(<4 x float> %a) {
; X32-LABEL: combine_permps_as_vpbroadcastss256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permps_as_vpbroadcastss256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
  ret <8 x float> %2
}

define <4 x double> @combine_permps_as_vpbroadcastsd256(<2 x double> %a) {
; X32-LABEL: combine_permps_as_vpbroadcastsd256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permps_as_vpbroadcastsd256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x double> %1 to <8 x float>
  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %4 = bitcast <8 x float> %3 to <4 x double>
  ret <4 x double> %4
}

define <16 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastb %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastb %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> zeroinitializer)
  ret <16 x i8> %2
}

define <32 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb256(<32 x i8> %a) {
; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastb %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> zeroinitializer)
  ret <32 x i8> %2
}

define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = bitcast <4 x float> %1 to <16 x i8>
  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %4 = bitcast <16 x i8> %3 to <4 x float>
  ret <4 x float> %4
}

define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) {
; X32-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
  ret <8 x float> %2
}

define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) {
; X32-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer
  %2 = bitcast <4 x double> %1 to <8 x float>
  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %4 = bitcast <8 x float> %3 to <4 x double>
  ret <4 x double> %4
}

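; vpermd index vectors that move aligned dword pairs can use the cheaper
; immediate-controlled vpermpd/vpermq encoding instead of a variable index.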
define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) {
; X32-LABEL: combine_permd_as_permq:
; X32:       # %bb.0:
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; X32-NEXT:    retl
;
; X64-LABEL: combine_permd_as_permq:
; X64:       # %bb.0:
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; X64-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>)
  ret <8 x i32> %1
}

define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
; X32-LABEL: combine_permps_as_permpd:
; X32:       # %bb.0:
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: combine_permps_as_permpd:
; X64:       # %bb.0:
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
; X64-NEXT:    retq
  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>)
  ret <8 x float> %1
}

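; A byte shuffle that interleaves words with zeros is recognized as a
; zero-extension and matched to vpmovzxwq; the zext128 variant below is not
; recognized and keeps its explicit shuffle sequence.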
define <4 x i64> @combine_pshufb_as_zext(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_zext:
; X32:       # %bb.0:
; X32-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_zext:
; X64:       # %bb.0:
; X64-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 10, i8 11, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_zext128:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_zext128:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
; X32-LABEL: combine_pshufb_as_vzmovl_64:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vzmovl_64:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; X64-NEXT:    retq
  %1 = bitcast <4 x double> %a0 to <32 x i8>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x double>
  ret <4 x double> %3
}

define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
; X32-LABEL: combine_pshufb_as_vzmovl_32:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vzmovl_32:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT:    retq
  %1 = bitcast <8 x float> %a0 to <32 x i8>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <8 x float>
  ret <8 x float> %3
}

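; pshufb masks that shift bytes within each 128-bit lane (zero-filling via
; 0x80 control bytes) map onto the immediate shift instructions: byte shifts
; become vpslldq/vpsrldq and element-aligned shifts become vpsrlw/vpslld/vpsrlq.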
define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pslldq:
; X32:       # %bb.0:
; X32-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pslldq:
; X64:       # %bb.0:
; X64-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrldq:
; X32:       # %bb.0:
; X32-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_psrldq:
; X64:       # %bb.0:
; X64-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrlw:
; X32:       # %bb.0:
; X32-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_psrlw:
; X64:       # %bb.0:
; X64-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128, i8 17, i8 128, i8 19, i8 128, i8 21, i8 128, i8 23, i8 128, i8 25, i8 128, i8 27, i8 128, i8 29, i8 128, i8 31, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pslld:
; X32:       # %bb.0:
; X32-NEXT:    vpslld $24, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pslld:
; X64:       # %bb.0:
; X64-NEXT:    vpslld $24, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12, i8 128, i8 128, i8 128, i8 16, i8 128, i8 128, i8 128, i8 20, i8 128, i8 128, i8 128, i8 24, i8 128, i8 128, i8 128, i8 28>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrlq:
; X32:       # %bb.0:
; X32-NEXT:    vpsrlq $40, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_psrlq:
; X64:       # %bb.0:
; X64-NEXT:    vpsrlq $40, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 21, i8 22, i8 23, i8 128, i8 128, i8 128, i8 128, i8 128, i8 29, i8 30, i8 31, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <32 x i8> %res0
}

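; Word shuffles confined to the low (or high) four words of each lane match
; vpshuflw (vpshufhw); a shuffle touching both halves, as in
; combine_pshufb_not_as_pshufw, still needs a full vpshufb.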
define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pshuflw:
; X32:       # %bb.0:
; X32-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pshuflw:
; X64:       # %bb.0:
; X64-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pshufhw:
; X32:       # %bb.0:
; X32-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pshufhw:
; X64:       # %bb.0:
; X64-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_not_as_pshufw:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_not_as_pshufw:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res1
}

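; Interleaving byte shuffles are matched to the unpack instructions: with
; undef odd bytes the shuffle pair collapses to the identity, while explicit
; zero lanes become vpunpcklwd/vpunpckhbw against a zero register.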
define <32 x i8> @combine_pshufb_as_unpacklo_undef(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_unpacklo_undef:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_unpacklo_undef:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 undef, i8 0, i8 undef, i8 1, i8 undef, i8 2, i8 undef, i8 3, i8 undef, i8 4, i8 undef, i8 5, i8 undef, i8 6, i8 undef, i8 7, i8 undef, i8 16, i8 undef, i8 17, i8 undef, i8 18, i8 undef, i8 19, i8 undef, i8 20, i8 undef, i8 21, i8 undef, i8 22, i8 undef, i8 23>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_as_unpacklo_zero(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_unpacklo_zero:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_unpacklo_zero:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 2, i8 3, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 16, i8 17, i8 -1, i8 -1, i8 18, i8 19, i8 -1, i8 -1, i8 20, i8 21, i8 -1, i8 -1, i8 22, i8 23, i8 -1, i8 -1>)
  ret <32 x i8> %1
}

define <32 x i8> @combine_pshufb_as_unpackhi_zero(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_unpackhi_zero:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_unpackhi_zero:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1, i8 24, i8 -1, i8 25, i8 -1, i8 26, i8 -1, i8 27, i8 -1, i8 28, i8 -1, i8 29, i8 -1, i8 30, i8 -1, i8 31>)
  ret <32 x i8> %1
}

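; A vector shift followed by a pshufb that moves the shifted bytes straight
; back folds further: the psrlw/pslld cases reduce to a constant AND mask and
; the psrlq case becomes one combined vpshufb.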
define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
; X32-LABEL: combine_psrlw_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_psrlw_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = bitcast <16 x i16> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14, i8 17, i8 16, i8 19, i8 18, i8 21, i8 20, i8 23, i8 22, i8 25, i8 24, i8 27, i8 26, i8 29, i8 28, i8 31, i8 30>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
; X32-LABEL: combine_pslld_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pslld_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shl <8 x i32> %a0, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %2 = bitcast <8 x i32> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12, i8 19, i8 18, i8 17, i8 16, i8 23, i8 22, i8 21, i8 20, i8 27, i8 26, i8 25, i8 24, i8 31, i8 30, i8 29, i8 28>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
; X32-LABEL: combine_psrlq_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_psrlq_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; X64-NEXT:    retq
  %1 = lshr <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
; X32-LABEL: combine_unpack_unpack_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
; X32-NEXT:    retl
;
; X64-LABEL: combine_unpack_unpack_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = shufflevector <32 x i8> %1, <32 x i8> %3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <32 x i8> %4, <32 x i8> %5, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  ret <32 x i8> %6
}

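; The saturating packs below are fed values already known to fit (all sign
; bits, or values shifted down into range), so the combiner can look through
; the pack and express each sequence as at most a shift plus one vpshufb.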
define <16 x i16> @shuffle_combine_packssdw_pshufb(<8 x i32> %a0) {
; X32-LABEL: shuffle_combine_packssdw_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpsrad $31, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
; X32-NEXT:    retl
;
; X64-LABEL: shuffle_combine_packssdw_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpsrad $31, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
; X64-NEXT:    retq
  %1 = ashr <8 x i32> %a0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %2 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %1)
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @shuffle_combine_packsswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
; X32-LABEL: shuffle_combine_packsswb_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpsraw $15, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
; X32-NEXT:    retl
;
; X64-LABEL: shuffle_combine_packsswb_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpsraw $15, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
; X64-NEXT:    retq
  %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = ashr <16 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %3 = tail call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
  %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <32 x i8> %4
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @shuffle_combine_packusdw_pshufb(<8 x i32> %a0, <8 x i32> %a1) {
; X32-LABEL: shuffle_combine_packusdw_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
; X32-NEXT:    retl
;
; X64-LABEL: shuffle_combine_packusdw_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
; X64-NEXT:    retq
  %1 = lshr <8 x i32> %a0, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %1)
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @shuffle_combine_packuswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
; X32-LABEL: shuffle_combine_packuswb_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
; X32-NEXT:    retl
;
; X64-LABEL: shuffle_combine_packuswb_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
; X64-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
  %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <32 x i8> %4
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

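; Splatting an inserted scalar: on 32-bit targets the i64 argument lives on
; the stack and is broadcast directly from memory, while on 64-bit it is first
; moved from %rdi into an xmm register.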
define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) {
; X32-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastq {{[0-9]+}}(%esp), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = insertelement <2 x i64> undef, i64 %a0, i32 0
  %2 = bitcast <2 x i64> %1 to <16 x i8>
  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %3
}

define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) {
; X32-LABEL: combine_permd_insertion_as_broadcast_v4i64:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permd_insertion_as_broadcast_v4i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %2 = bitcast <4 x i64> %1 to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  ret <8 x i32> %3
}

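; Shuffles of constant vectors are constant-folded and materialized as a
; single constant-pool load.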
define <8 x i32> @constant_fold_permd() {
; X32-LABEL: constant_fold_permd:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_permd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; X64-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
  ret <8 x i32> %1
}

define <8 x float> @constant_fold_permps() {
; X32-LABEL: constant_fold_permps:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_permps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
; X64-NEXT:    retq
  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
  ret <8 x float> %1
}

define <32 x i8> @constant_fold_pshufb_256() {
; X32-LABEL: constant_fold_pshufb_256:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_pshufb_256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
  ret <32 x i8> %1
}

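; Reduced regression tests for PR27320 and PR34577: combining shuffles through
; bitcasts and selects must still produce correct code.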
define <32 x i8> @PR27320(<8 x i32> %a0) {
; X32-LABEL: PR27320:
; X32:       # %bb.0:
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23]
; X32-NEXT:    retl
;
; X64-LABEL: PR27320:
; X64:       # %bb.0:
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23]
; X64-NEXT:    retq
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef>
  %2 = bitcast <8 x i32> %1 to <32 x i8>
  %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 16, i32 17, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 23, i32 23, i32 24, i32 25, i32 26, i32 26, i32 27>
  ret <32 x i8> %3
}

define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) {
; X32-LABEL: PR34577:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; X32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; X32-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
; X32-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: PR34577:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; X64-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
; X64-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; X64-NEXT:    retq
entry:
  %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0>
  %sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer
  %shuf1 = shufflevector <8 x float> zeroinitializer, <8 x float> %sel, <8 x i32> <i32 6, i32 11, i32 6, i32 15, i32 12, i32 11, i32 1, i32 3>
  %shuf2 = shufflevector <8 x float> %inp1, <8 x float> %shuf1, <8 x i32> <i32 15, i32 10, i32 7, i32 2, i32 12, i32 undef, i32 3, i32 2>
  ret <8 x float> %shuf2
}
    978