; X86 vector shift-left (shl) DAG-combine tests.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST

; fold (shl 0, x) -> 0
; The whole expression folds to an all-zero register (xorps / vxorps).
define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}
     21 
; fold (shl x, c >= size(x)) -> undef
; Uniform out-of-range amount (33 > 32): the body folds away, only ret remains.
define <4 x i32> @combine_vec_shl_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}
     30 
; Same fold with distinct per-lane out-of-range amounts.
define <4 x i32> @combine_vec_shl_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}
     38 
; The fold still applies when the operand is masked first: 33 is out of range
; regardless of the preceding 'and'.
define <4 x i32> @combine_vec_shl_outofrange2(<4 x i32> %a0) {
; CHECK-LABEL: combine_vec_shl_outofrange2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = and <4 x i32> %a0, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
  %2 = shl <4 x i32> %1, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %2
}
     47 
; fold (shl x, 0) -> x
; No instructions expected: the shift-by-zero is removed and %x passes through.
define <4 x i32> @combine_vec_shl_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_by_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}
     56 
; if (shl x, c) is known to be zero, return 0
; The mask keeps only the top 16 bits, which the shl by 16 shifts out entirely.
define <4 x i32> @combine_vec_shl_known_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_known_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_known_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
  %2 = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %2
}
     72 
; Non-uniform variant: not every lane folds to zero, so a masked
; multiply (SSE) / variable shift (AVX) remains.
define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_known_zero1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65536,32768,16384,8192]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_known_zero1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_known_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 8589803520, i32 17179607040, i32 34359214080>
  %2 = shl <4 x i32> %1, <i32 16, i32 15, i32 14, i32 13>
  ret <4 x i32> %2
}
    102 
; fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
; The <4 x i64> mask is narrowed so the 'and' and the shift both operate on
; <4 x i32> (see the xmm-width vpand/vpsllvd in the AVX output).
define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE2-LABEL: combine_vec_shl_trunc_and:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_trunc_and:
; SSE41:       # %bb.0:
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pslld $23, %xmm1
; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX-SLOW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-SLOW-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: combine_vec_shl_trunc_and:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX-FAST-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-FAST-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = shl <4 x i32> %x, %2
  ret <4 x i32> %3
}
    153 
; fold (shl (shl x, c1), c2) -> (shl x, (add c1, c2))
; 2 + 4 = 6: a single pslld $6.
define <4 x i32> @combine_vec_shl_shl0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $6, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $6, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = shl <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}
    169 
; Non-uniform amounts still combine into one shift: the summed amounts
; 4,6,8,10 appear as the multiplier constants [16,64,256,1024].
define <4 x i32> @combine_vec_shl_shl1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_shl1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,64,256,1024]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_shl1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = shl <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}
    196 
; fold (shl (shl x, c1), c2) -> 0 when (c1 + c2) >= bitwidth
; Here 16 + 20 = 36 >= 32, so the result is all-zero.
; NOTE(review): 'shlr' in the name looks like a typo for 'shl' (cf.
; combine_vec_shl_shl_zero1 below); kept as-is to match the CHECK labels.
define <4 x i32> @combine_vec_shl_shlr_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shlr_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shlr_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = shl <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
  ret <4 x i32> %2
}
    212 
; Non-uniform variant: every lane's combined amount exceeds 32, so all fold to zero.
define <4 x i32> @combine_vec_shl_shl_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl_zero1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = shl <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}
    227 
; fold (shl (ext (shl x, c1)), c2) -> (ext (shl x, (add c1, c2)))
; 4 + 16 = 20: a single shift by 20 after the extend (AVX: one vpslld $20).
define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $20, %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    pslld $20, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE41-NEXT:    pslld $20, %xmm1
; SSE41-NEXT:    pslld $20, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpslld $20, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <8 x i32> %3
}
    258 
; Non-uniform amounts: the inner i16 shl is lowered as a pmullw; the outer
; i32 shl remains after the sign-extend.
define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    pslld $30, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pslld $29, %xmm2
; SSE2-NEXT:    pslld $28, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pslld $30, %xmm2
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    pslld $28, %xmm2
; SSE41-NEXT:    pslld $29, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 31, i32 31, i32 30, i32 30, i32 29, i32 29, i32 28, i32 28>
  ret <8 x i32> %3
}
    304 
; fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
; Uniform C: the srl+shl pair collapses to a mask-and-extend (pand + pmovzxwd).
define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i32> %3
}
    336 
; Non-uniform amounts: the i16 lshr is lowered as pmulhuw and the i32 shl
; as a per-lane multiply (SSE) / variable shift (AVX) after the zero-extend.
define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmulhuw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2,4,8,16]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32,64,128,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulhuw {{.*}}(%rip), %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ret <8 x i32> %3
}
    384 
; fold (shl (sr[la] exact X,  C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
; 5 - 3 = 2: a single pslld $2 remains.
; NOTE(review): 'extact' in the function names is a typo for 'exact'; kept
; as-is so the CHECK-LABEL lines stay in sync.
define <4 x i32> @combine_vec_shl_ge_ashr_extact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ge_ashr_extact0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_extact0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}
    400 
; Non-uniform variant: the ashr/shl pair survives as per-lane shifts plus a
; multiply (the fold is not applied here).
define <4 x i32> @combine_vec_shl_ge_ashr_extact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_ge_ashr_extact1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $5, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $4, %xmm1
; SSE2-NEXT:    psrad $3, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32,64,128,256]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ge_ashr_extact1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $5, %xmm1
; SSE41-NEXT:    psrad $3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_extact1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 4, i32 5, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}
    450 
; fold (shl (sr[la] exact X,  C1), C2) -> (sr[la] X, (C2-C1)) if C1  > C2
; 5 - 3 = 2: a single arithmetic shift right remains.
define <4 x i32> @combine_vec_shl_lt_ashr_extact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_lt_ashr_extact0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_extact0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}
    466 
; Non-uniform variant: the ashr/shl pair survives as per-lane shifts plus a
; multiply (the fold is not applied here).
define <4 x i32> @combine_vec_shl_lt_ashr_extact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_lt_ashr_extact1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $7, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $6, %xmm1
; SSE2-NEXT:    psrad $5, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [8,16,32,256]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_lt_ashr_extact1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $6, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $7, %xmm1
; SSE41-NEXT:    psrad $5, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_extact1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}
    516 
; fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1)), MASK) if C2 > C1
; Here: shl by 2 (= 5 - 3) then mask off the low bits (0xFFFFFFE0).
define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_gt_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}
    535 
; Non-uniform variant: the lshr/shl pair survives as per-lane shifts plus a
; multiply (the fold is not applied here).
define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_gt_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $8, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $5, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $4, %xmm1
; SSE2-NEXT:    psrld $3, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32,64,128,256]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_gt_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $5, %xmm1
; SSE41-NEXT:    psrld $3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 4, i32 5, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}
    585 
; fold (shl (srl x, c1), c2) -> (and (srl x, (sub c1, c2)), MASK) if C1 >= C2
; Here: srl by 2 (= 5 - 3) then mask (0x3FFFFFF8).
define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_le_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_le_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}
    604 
; Non-uniform variant: the lshr/shl pair survives as per-lane shifts plus a
; multiply (the fold is not applied here).
define <4 x i32> @combine_vec_shl_le_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_le_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $8, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $7, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $6, %xmm1
; SSE2-NEXT:    psrld $5, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [8,16,32,256]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_le_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $6, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $7, %xmm1
; SSE41-NEXT:    psrld $5, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_le_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}
    654 
; fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
; shl 5 of ashr 5 == clearing the low 5 bits: a single and with 0xFFFFFFE0.
define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr0:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ashr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}
    671 
; Non-uniform amounts (sra and shl match per lane) still fold to a single mask.
define <4 x i32> @combine_vec_shl_ashr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr1:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ashr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}
    686 
; fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
; 5 << 2 = 20 becomes the new addend.
define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_add0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_add0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}
    705 
    ; Non-uniform variant: the per-lane shl is lowered as a multiply by
    ; [2,4,8,16] — widened via pmuludq/pshufd on SSE2, a single pmulld on
    ; SSE4.1, and a variable shift (vpsllvd) on AVX2 — followed by an add of
    ; the pre-shifted constant.
    706 define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
    707 ; SSE2-LABEL: combine_vec_shl_add1:
    708 ; SSE2:       # %bb.0:
    709 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2,4,8,16]
    710 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
    711 ; SSE2-NEXT:    pmuludq %xmm1, %xmm0
    712 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    713 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
    714 ; SSE2-NEXT:    pmuludq %xmm2, %xmm1
    715 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    716 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    717 ; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
    718 ; SSE2-NEXT:    retq
    719 ;
    720 ; SSE41-LABEL: combine_vec_shl_add1:
    721 ; SSE41:       # %bb.0:
    722 ; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
    723 ; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
    724 ; SSE41-NEXT:    retq
    725 ;
    726 ; AVX-LABEL: combine_vec_shl_add1:
    727 ; AVX:       # %bb.0:
    728 ; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
    729 ; AVX-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
    730 ; AVX-NEXT:    retq
    731   %1 = add <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
    732   %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
    733   ret <4 x i32> %2
    734 }
    735 
    736 ; fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
    ; Uniform case: the OR constant is pre-shifted (5 << 2 == 20), hence the
    ; [20,20,20,20] splat OR'd in after the shift.
    737 define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
    738 ; SSE-LABEL: combine_vec_shl_or0:
    739 ; SSE:       # %bb.0:
    740 ; SSE-NEXT:    pslld $2, %xmm0
    741 ; SSE-NEXT:    por {{.*}}(%rip), %xmm0
    742 ; SSE-NEXT:    retq
    743 ;
    744 ; AVX-LABEL: combine_vec_shl_or0:
    745 ; AVX:       # %bb.0:
    746 ; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
    747 ; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
    748 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
    749 ; AVX-NEXT:    retq
    750   %1 = or  <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
    751   %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
    752   ret <4 x i32> %2
    753 }
    754 
    ; Non-uniform variant: the per-lane shl is lowered as a multiply by
    ; [2,4,8,16] (pmuludq/pshufd on SSE2, pmulld on SSE4.1, vpsllvd on AVX2),
    ; then the pre-shifted constant is OR'd in.
    755 define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
    756 ; SSE2-LABEL: combine_vec_shl_or1:
    757 ; SSE2:       # %bb.0:
    758 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2,4,8,16]
    759 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
    760 ; SSE2-NEXT:    pmuludq %xmm1, %xmm0
    761 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    762 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
    763 ; SSE2-NEXT:    pmuludq %xmm2, %xmm1
    764 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    765 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    766 ; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
    767 ; SSE2-NEXT:    retq
    768 ;
    769 ; SSE41-LABEL: combine_vec_shl_or1:
    770 ; SSE41:       # %bb.0:
    771 ; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
    772 ; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
    773 ; SSE41-NEXT:    retq
    774 ;
    775 ; AVX-LABEL: combine_vec_shl_or1:
    776 ; AVX:       # %bb.0:
    777 ; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
    778 ; AVX-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
    779 ; AVX-NEXT:    retq
    780   %1 = or  <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
    781   %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
    782   ret <4 x i32> %2
    783 }
    784 
    785 ; fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
    ; Uniform case: mul-by-5 then shl-by-2 collapses into a single multiply by
    ; 20 (5 << 2) in every lane — one pmulld on SSE4.1, pmuludq widening on
    ; SSE2, and a broadcast + vpmulld on AVX2.
    786 define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
    787 ; SSE2-LABEL: combine_vec_shl_mul0:
    788 ; SSE2:       # %bb.0:
    789 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [20,20,20,20]
    790 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
    791 ; SSE2-NEXT:    pmuludq %xmm1, %xmm0
    792 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    793 ; SSE2-NEXT:    pmuludq %xmm1, %xmm2
    794 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
    795 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    796 ; SSE2-NEXT:    retq
    797 ;
    798 ; SSE41-LABEL: combine_vec_shl_mul0:
    799 ; SSE41:       # %bb.0:
    800 ; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
    801 ; SSE41-NEXT:    retq
    802 ;
    803 ; AVX-LABEL: combine_vec_shl_mul0:
    804 ; AVX:       # %bb.0:
    805 ; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
    806 ; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
    807 ; AVX-NEXT:    retq
    808   %1 = mul <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
    809   %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
    810   ret <4 x i32> %2
    811 }
    812 
    ; Non-uniform variant: mul and shl constants fold into a single per-lane
    ; multiplier — 5<<1=10, 6<<2=24, 7<<3=56, 8<<4=128 — matching the
    ; [10,24,56,128] constant-pool vector used by every target.
    813 define <4 x i32> @combine_vec_shl_mul1(<4 x i32> %x) {
    814 ; SSE2-LABEL: combine_vec_shl_mul1:
    815 ; SSE2:       # %bb.0:
    816 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [10,24,56,128]
    817 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
    818 ; SSE2-NEXT:    pmuludq %xmm1, %xmm0
    819 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    820 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
    821 ; SSE2-NEXT:    pmuludq %xmm2, %xmm1
    822 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    823 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    824 ; SSE2-NEXT:    retq
    825 ;
    826 ; SSE41-LABEL: combine_vec_shl_mul1:
    827 ; SSE41:       # %bb.0:
    828 ; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
    829 ; SSE41-NEXT:    retq
    830 ;
    831 ; AVX-LABEL: combine_vec_shl_mul1:
    832 ; AVX:       # %bb.0:
    833 ; AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
    834 ; AVX-NEXT:    retq
    835   %1 = mul <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
    836   %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
    837   ret <4 x i32> %2
    838 }
    839