; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512F
;
; Combine tests involving SSE3/SSSE3 target shuffles (MOVDDUP, MOVSHDUP, MOVSLDUP, PSHUFB)

declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)

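; Each byte of a PSHUFB result is zeroed when bit 7 of the corresponding mask
; byte is set (128 or -1); otherwise the low 4 mask bits select a source byte.
; Chaining the three masks below leaves no live source byte, so the whole
; sequence folds to an all-zeros vector.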
define <16 x i8> @combine_vpshufb_as_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_as_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vpshufb_as_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
  %res2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res1, <16 x i8> <i8 0, i8 1, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res2
}

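; The first mask interleaves the low eight source bytes with zeros and the
; second gathers the even lanes back into the low half, leaving the low qword
; of %a0 followed by zeros - a MOVQ.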
define <16 x i8> @combine_vpshufb_as_movq(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_as_movq:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vpshufb_as_movq:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 128, i8 1, i8 128, i8 2, i8 128, i8 3, i8 128, i8 4, i8 128, i8 5, i8 128, i8 6, i8 128, i8 7, i8 128>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 1, i8 3, i8 5, i8 7, i8 9, i8 11, i8 13, i8 15>)
  ret <16 x i8> %res1
}

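; This test and the next compose a cross-lane shufflevector with a PSHUFB that
; undoes the shuffled lane order, leaving a simple select of the low element:
; MOVSD/MOVSS on SSSE3, a blend once SSE4.1 blends are available.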
define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1) {
; SSSE3-LABEL: combine_pshufb_as_movsd:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT:    movapd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_pshufb_as_movsd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_movsd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 3, i32 0>
  %2 = bitcast <2 x double> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %4 = bitcast <16 x i8> %3 to <2 x double>
  ret <2 x double> %4
}

define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
; SSSE3-LABEL: combine_pshufb_as_movss:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_pshufb_as_movss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_movss:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 3, i32 2, i32 1>
  %2 = bitcast <4 x float> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 12, i8 13, i8 14, i8 15, i8 8, i8 9, i8 10, i8 11, i8 4, i8 5, i8 6, i8 7>)
  %4 = bitcast <16 x i8> %3 to <4 x float>
  ret <4 x float> %4
}

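; A mask that spreads bytes 0-3 across the dword lanes with zero fill is a
; byte-to-dword zero extension, matched to PMOVZXBD when SSE4.1 is available.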
define <4 x i32> @combine_pshufb_as_zext(<16 x i8> %a0) {
; SSSE3-LABEL: combine_pshufb_as_zext:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_pshufb_as_zext:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_zext:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 3, i8 -1, i8 -1, i8 -1>)
  %2 = bitcast <16 x i8> %1 to <4 x i32>
  ret <4 x i32> %2
}

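; Masks that keep the low qword/dword and zero everything above it match the
; VZMOVL patterns: MOVQ for 64 bits and a zeroing MOVSS/blend for 32 bits.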
define <2 x double> @combine_pshufb_as_vzmovl_64(<2 x double> %a0) {
; SSE-LABEL: combine_pshufb_as_vzmovl_64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_vzmovl_64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    retq
  %1 = bitcast <2 x double> %a0 to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <16 x i8> %2 to <2 x double>
  ret <2 x double> %3
}

define <4 x float> @combine_pshufb_as_vzmovl_32(<4 x float> %a0) {
; SSSE3-LABEL: combine_pshufb_as_vzmovl_32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_pshufb_as_vzmovl_32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_vzmovl_32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  ret <4 x float> %3
}

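; For PSHUFB feeding MOVDDUP/MOVSHDUP/MOVSLDUP, the duplicated lanes are folded
; back through the byte mask, leaving a single PSHUFB with the composed mask.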
define <4 x float> @combine_pshufb_movddup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movddup:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_movddup:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %4
}

define <4 x float> @combine_pshufb_movshdup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movshdup:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_movshdup:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %4
}

define <4 x float> @combine_pshufb_movsldup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movsldup:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_movsldup:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %4
}

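; The PALIGNR-style shuffle selects a0[8..15]:a1[0..7] and the byte mask then
; only reads the first eight of those bytes, so %a1 is dead and the sequence
; simplifies to duplicating the high qword of %a0.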
define <16 x i8> @combine_pshufb_palignr(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_pshufb_palignr:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_palignr:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_pslldq:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_pslldq:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_psrldq:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_psrldq:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i8> %2
}

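; Zero-blends on either side of the PSHUFB: only word 4 survives both the
; zeroing shuffle and the byte mask, giving a constant AND on SSSE3 and a
; PBLENDW against zero on SSE4.1+.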
define <16 x i8> @combine_and_pshufb(<16 x i8> %a0) {
; SSSE3-LABEL: combine_and_pshufb:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_and_pshufb:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_and_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_and(<16 x i8> %a0) {
; SSSE3-LABEL: combine_pshufb_and:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_pshufb_and:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_and:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %2
}

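; Single PSHUFBs whose masks match a dedicated instruction: byte rotations
; become PALIGNR, contiguous zero runs become PSLLDQ/PSRLDQ or per-element
; shifts, and masks confined to word lanes become PSHUFLW/PSHUFHW (with
; negative tests where no single word shuffle fits).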
define <16 x i8> @combine_pshufb_as_palignr(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_palignr:
; SSE:       # %bb.0:
; SSE-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_palignr:
; AVX:       # %bb.0:
; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 undef, i8 undef, i8 0>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslldq:
; SSE:       # %bb.0:
; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pslldq:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrldq:
; SSE:       # %bb.0:
; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_psrldq:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_psrlw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrlw:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_psrlw:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pslld(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslld:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $24, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pslld:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_psrlq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrlq:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlq $40, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_psrlq:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlq $40, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshuflw:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pshuflw:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pshufhw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshufhw:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pshufhw:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_not_as_pshufw:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_not_as_pshufw:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <16 x i8> %res1
}

define <16 x i8> @combine_vpshufb_as_pshuflw_not_pslld(<16 x i8> *%a0) {
; SSE-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; AVX-NEXT:    retq
  %res0 = load <16 x i8>, <16 x i8> *%a0, align 16
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  ret <16 x i8> %res1
}

define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
; SSE:       # %bb.0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpcklbw:
; AVX:       # %bb.0:
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 undef, i8 undef, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 7, i8 7>)
  ret <16 x i8> %1
}

define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpckhwd:
; SSE:       # %bb.0:
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpckhwd:
; AVX:       # %bb.0:
; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 8, i8 9, i8 10, i8 11, i8 10, i8 11, i8 12, i8 13, i8 12, i8 13, i8 14, i8 15, i8 undef, i8 undef>)
  ret <16 x i8> %1
}

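; The masks in the next two tests are unpack-with-undef patterns whose defined
; bytes land only in lanes the following shufflevector never reads, so every
; demanded element is undef and the tests fold to a plain return.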
define <8 x i16> @combine_pshufb_as_unpacklo_undef(<16 x i8> %a0) {
; ALL-LABEL: combine_pshufb_as_unpacklo_undef:
; ALL:       # %bb.0:
; ALL-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 2, i8 3, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 6, i8 7>)
  %2 = bitcast <16 x i8> %1 to <8 x i16>
  %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x i16> %3
}

define <16 x i8> @combine_pshufb_as_unpackhi_undef(<16 x i8> %a0) {
; ALL-LABEL: combine_pshufb_as_unpackhi_undef:
; ALL:       # %bb.0:
; ALL-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 undef, i8 9, i8 undef, i8 10, i8 undef, i8 11, i8 undef, i8 12, i8 undef, i8 13, i8 undef, i8 14, i8 undef, i8 15, i8 undef>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_as_unpacklo_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unpacklo_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_unpacklo_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %1
}

define <16 x i8> @combine_pshufb_as_unpackhi_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unpackhi_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_unpackhi_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1>)
  ret <16 x i8> %1
}

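; Vector shifts by a whole number of bytes are re-expressed as byte shuffles
; and merged into the following PSHUFB mask, leaving a single shuffle.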
define <16 x i8> @combine_psrlw_pshufb(<8 x i16> %a0) {
; SSE-LABEL: combine_psrlw_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_psrlw_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = bitcast <8 x i16> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %3
}

define <16 x i8> @combine_pslld_pshufb(<4 x i32> %a0) {
; SSE-LABEL: combine_pslld_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pslld_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 8, i32 8, i32 8, i32 8>
  %2 = bitcast <4 x i32> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12>)
  ret <16 x i8> %3
}

define <16 x i8> @combine_psrlq_pshufb(<2 x i64> %a0) {
; SSE-LABEL: combine_psrlq_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_psrlq_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; AVX-NEXT:    retq
  %1 = lshr <2 x i64> %a0, <i64 48, i64 48>
  %2 = bitcast <2 x i64> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>)
  ret <16 x i8> %3
}

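; The byte mask only demands lanes coming from one unpack operand, so the
; unpack is dropped and the PSHUFB is applied to that argument directly.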
define <16 x i8> @combine_unpckl_arg0_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg0_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_unpckl_arg0_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg1_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_unpckl_arg1_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

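; A chain of inserts, extracts and unpacks collapses into a single PSHUFB with
; the composed word mask.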
define <8 x i16> @shuffle_combine_unpack_insert(<8 x i16> %a0) {
; SSE-LABEL: shuffle_combine_unpack_insert:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_combine_unpack_insert:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
; AVX-NEXT:    retq
  %1 = extractelement <8 x i16> %a0, i32 2
  %2 = extractelement <8 x i16> %a0, i32 4
  %3 = insertelement <8 x i16> %a0, i16 %1, i32 4
  %4 = insertelement <8 x i16> %a0, i16 %2, i32 2
  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %6 = shufflevector <8 x i16> %5, <8 x i16> %3, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
  %7 = shufflevector <8 x i16> %5, <8 x i16> %a0, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i16> %8
}

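; Packs of known-range inputs (all-sign-bit values or high-byte shifts) can be
; bypassed, with the byte mask rewritten in terms of the wider source elements.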
define <16 x i8> @shuffle_combine_packssdw_pshufb(<4 x i32> %a0) {
; SSE-LABEL: shuffle_combine_packssdw_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[13,12,9,8,5,4,1,0,13,12,9,8,5,4,1,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_combine_packssdw_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[13,12,9,8,5,4,1,0,13,12,9,8,5,4,1,0]
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
  %2 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %1)
  %3 = bitcast <8 x i16> %2 to <16 x i8>
  %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>)
  ret <16 x i8> %4
}
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone

define <16 x i8> @shuffle_combine_packsswb_pshufb(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: shuffle_combine_packsswb_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    psraw $15, %xmm0
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_combine_packsswb_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0]
; AVX-NEXT:    retq
  %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = ashr <8 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %3 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %1, <8 x i16> %2)
  %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <16 x i8> %4
}
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @shuffle_combine_packuswb_pshufb(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: shuffle_combine_packuswb_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_combine_packuswb_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1]
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = lshr <8 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %3 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %1, <8 x i16> %2)
  %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <16 x i8> %4
}
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone

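; With constant inputs the PSHUFB is evaluated at compile time and the result
; is materialized as a constant-pool load.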
define <16 x i8> @constant_fold_pshufb() {
; SSE-LABEL: constant_fold_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
; SSE-NEXT:    retq
;
; AVX-LABEL: constant_fold_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
  ret <16 x i8> %1
}

; FIXME - unnecessary pshufb/broadcast being used - pshufb mask only needs lowest byte.
define <16 x i8> @constant_fold_pshufb_2() {
; SSE-LABEL: constant_fold_pshufb_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movl $2, %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pshufb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: constant_fold_pshufb_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movl $2, %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_fold_pshufb_2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movl $2, %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_fold_pshufb_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movl $2, %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX512F-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 2, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  ret <16 x i8> %1
}

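; Only the top dword of the PSHUFB result is demanded in the next two tests,
; so the unused mask lanes relax to undef and the AND constant is folded into
; zeroed mask bytes.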
define i32 @mask_zzz3_v16i8(<16 x i8> %a0) {
; SSSE3-LABEL: mask_zzz3_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    movd %xmm0, %eax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: mask_zzz3_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14]
; SSE41-NEXT:    pextrd $3, %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: mask_zzz3_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14]
; AVX-NEXT:    vpextrd $3, %xmm0, %eax
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %2 = bitcast <16 x i8> %1 to <4 x i32>
  %3 = extractelement <4 x i32> %2, i32 3
  %4 = and i32 %3, 4278190080
  ret i32 %4
}

define i32 @mask_z1z3_v16i8(<16 x i8> %a0) {
; SSSE3-LABEL: mask_z1z3_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[10],zero,xmm0[14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    movd %xmm0, %eax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: mask_z1z3_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,xmm0[10],zero,xmm0[14]
; SSE41-NEXT:    pextrd $3, %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: mask_z1z3_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,xmm0[10],zero,xmm0[14]
; AVX-NEXT:    vpextrd $3, %xmm0, %eax
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %2 = bitcast <16 x i8> %1 to <4 x i32>
  %3 = extractelement <4 x i32> %2, i32 3
  %4 = and i32 %3, 4278255360
  ret i32 %4
}

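; PR22415: byte shuffles narrowed through <8 x i8> and <3 x i8> should still
; combine to a single PSHUFB plus a scalar move.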
define i32 @PR22415(double %a0) {
; SSE-LABEL: PR22415:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: PR22415:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = bitcast double %a0 to <8 x i8>
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 undef>
  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %4 = bitcast <3 x i8> %3 to i24
  %5 = zext i24 %4 to i32
  ret i32 %5
}