; X86 vector logical-shift-right (lshr / ISD::SRL) DAG-combine tests.
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST
      5 
      6 ; fold (srl 0, x) -> 0
      7 define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
      8 ; SSE-LABEL: combine_vec_lshr_zero:
      9 ; SSE:       # %bb.0:
     10 ; SSE-NEXT:    xorps %xmm0, %xmm0
     11 ; SSE-NEXT:    retq
     12 ;
     13 ; AVX-LABEL: combine_vec_lshr_zero:
     14 ; AVX:       # %bb.0:
     15 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
     16 ; AVX-NEXT:    retq
        ; Shifting the zero vector right by any (even variable) amount is still
        ; zero, so both targets should collapse to a single register-zeroing xor.
     17   %1 = lshr <4 x i32> zeroinitializer, %x
     18   ret <4 x i32> %1
     19 }
     20 
     21 ; fold (srl x, c >= size(x)) -> undef
     22 define <4 x i32> @combine_vec_lshr_outofrange0(<4 x i32> %x) {
     23 ; CHECK-LABEL: combine_vec_lshr_outofrange0:
     24 ; CHECK:       # %bb.0:
     25 ; CHECK-NEXT:    retq
        ; Uniform shift amount 33 >= element width 32, so the result is undef
        ; and no shift instructions should be emitted.
     26   %1 = lshr <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
     27   ret <4 x i32> %1
     28 }
     29 
     30 define <4 x i32> @combine_vec_lshr_outofrange1(<4 x i32> %x) {
     31 ; CHECK-LABEL: combine_vec_lshr_outofrange1:
     32 ; CHECK:       # %bb.0:
     33 ; CHECK-NEXT:    retq
        ; Non-uniform amounts, but every lane's amount (33..36) is >= the
        ; element width 32 — still undef, still no code.
     34   %1 = lshr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
     35   ret <4 x i32> %1
     36 }
     37 
     38 ; fold (srl x, 0) -> x
     39 define <4 x i32> @combine_vec_lshr_by_zero(<4 x i32> %x) {
     40 ; CHECK-LABEL: combine_vec_lshr_by_zero:
     41 ; CHECK:       # %bb.0:
     42 ; CHECK-NEXT:    retq
        ; A shift by zero is the identity, so %x should pass straight through
        ; in %xmm0 with no instructions.
     43   %1 = lshr <4 x i32> %x, zeroinitializer
     44   ret <4 x i32> %1
     45 }
     46 
     47 ; if (srl x, c) is known to be zero, return 0
     48 define <4 x i32> @combine_vec_lshr_known_zero0(<4 x i32> %x) {
     49 ; SSE-LABEL: combine_vec_lshr_known_zero0:
     50 ; SSE:       # %bb.0:
     51 ; SSE-NEXT:    xorps %xmm0, %xmm0
     52 ; SSE-NEXT:    retq
     53 ;
     54 ; AVX-LABEL: combine_vec_lshr_known_zero0:
     55 ; AVX:       # %bb.0:
     56 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
     57 ; AVX-NEXT:    retq
        ; Masking with 15 leaves only the low 4 bits set; shifting right by 4
        ; removes all of them, so known-bits analysis folds this to zero.
     58   %1 = and <4 x i32> %x, <i32 15, i32 15, i32 15, i32 15>
     59   %2 = lshr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
     60   ret <4 x i32> %2
     61 }
     62 
     63 define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) {
     64 ; SSE-LABEL: combine_vec_lshr_known_zero1:
     65 ; SSE:       # %bb.0:
     66 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
     67 ; SSE-NEXT:    movdqa %xmm0, %xmm1
     68 ; SSE-NEXT:    psrld $11, %xmm1
     69 ; SSE-NEXT:    movdqa %xmm0, %xmm2
     70 ; SSE-NEXT:    psrld $9, %xmm2
     71 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
     72 ; SSE-NEXT:    movdqa %xmm0, %xmm1
     73 ; SSE-NEXT:    psrld $10, %xmm1
     74 ; SSE-NEXT:    psrld $8, %xmm0
     75 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
     76 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
     77 ; SSE-NEXT:    retq
     78 ;
     79 ; AVX-LABEL: combine_vec_lshr_known_zero1:
     80 ; AVX:       # %bb.0:
     81 ; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
     82 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
     83 ; AVX-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
     84 ; AVX-NEXT:    retq
        ; NOTE(review): after (and x, 15) every shift amount (8..11) exceeds the
        ; 4 possibly-set bits, so the result is provably zero — but the
        ; non-uniform amounts defeat the known-zero fold, and the checks pin the
        ; current lowering (SSE: per-amount psrld + blends; AVX2: vpsrlvd)
        ; rather than an xor.
     85   %1 = and <4 x i32> %x, <i32 15, i32 15, i32 15, i32 15>
     86   %2 = lshr <4 x i32> %1, <i32 8, i32 9, i32 10, i32 11>
     87   ret <4 x i32> %2
     88 }
     89 
     90 ; fold (srl (srl x, c1), c2) -> (srl x, (add c1, c2))
     91 define <4 x i32> @combine_vec_lshr_lshr0(<4 x i32> %x) {
     92 ; SSE-LABEL: combine_vec_lshr_lshr0:
     93 ; SSE:       # %bb.0:
     94 ; SSE-NEXT:    psrld $6, %xmm0
     95 ; SSE-NEXT:    retq
     96 ;
     97 ; AVX-LABEL: combine_vec_lshr_lshr0:
     98 ; AVX:       # %bb.0:
     99 ; AVX-NEXT:    vpsrld $6, %xmm0, %xmm0
    100 ; AVX-NEXT:    retq
        ; Two uniform shifts merge into one: 2 + 4 = 6, hence a single
        ; psrld/vpsrld by 6.
    101   %1 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
    102   %2 = lshr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
    103   ret <4 x i32> %2
    104 }
    105 
    106 define <4 x i32> @combine_vec_lshr_lshr1(<4 x i32> %x) {
    107 ; SSE-LABEL: combine_vec_lshr_lshr1:
    108 ; SSE:       # %bb.0:
    109 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    110 ; SSE-NEXT:    psrld $10, %xmm1
    111 ; SSE-NEXT:    movdqa %xmm0, %xmm2
    112 ; SSE-NEXT:    psrld $6, %xmm2
    113 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
    114 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    115 ; SSE-NEXT:    psrld $8, %xmm1
    116 ; SSE-NEXT:    psrld $4, %xmm0
    117 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
    118 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    119 ; SSE-NEXT:    retq
    120 ;
    121 ; AVX-LABEL: combine_vec_lshr_lshr1:
    122 ; AVX:       # %bb.0:
    123 ; AVX-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
    124 ; AVX-NEXT:    retq
        ; Non-uniform amounts still merge per lane: (0,1,2,3) + (4,5,6,7) =
        ; (4,6,8,10).  SSE4.1 has no variable dword shift, so it emits the four
        ; immediate psrld + blend pattern; AVX2 uses one vpsrlvd.
    125   %1 = lshr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
    126   %2 = lshr <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
    127   ret <4 x i32> %2
    128 }
    129 
    130 ; fold (srl (srl x, c1), c2) -> 0
    131 define <4 x i32> @combine_vec_lshr_lshr_zero0(<4 x i32> %x) {
    132 ; SSE-LABEL: combine_vec_lshr_lshr_zero0:
    133 ; SSE:       # %bb.0:
    134 ; SSE-NEXT:    xorps %xmm0, %xmm0
    135 ; SSE-NEXT:    retq
    136 ;
    137 ; AVX-LABEL: combine_vec_lshr_lshr_zero0:
    138 ; AVX:       # %bb.0:
    139 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
    140 ; AVX-NEXT:    retq
        ; The merged amount 16 + 20 = 36 exceeds the 32-bit element width, so
        ; the combined shift is known zero and folds to a single xor.
    141   %1 = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
    142   %2 = lshr <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
    143   ret <4 x i32> %2
    144 }
    145 
    146 define <4 x i32> @combine_vec_lshr_lshr_zero1(<4 x i32> %x) {
    147 ; SSE-LABEL: combine_vec_lshr_lshr_zero1:
    148 ; SSE:       # %bb.0:
    149 ; SSE-NEXT:    xorps %xmm0, %xmm0
    150 ; SSE-NEXT:    retq
    151 ;
    152 ; AVX-LABEL: combine_vec_lshr_lshr_zero1:
    153 ; AVX:       # %bb.0:
    154 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
    155 ; AVX-NEXT:    retq
        ; Same fold with non-uniform amounts: every per-lane sum (17+25 .. 20+28)
        ; is >= 32, so all lanes are known zero.
    156   %1 = lshr <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
    157   %2 = lshr <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
    158   ret <4 x i32> %2
    159 }
    160 
    161 ; fold (srl (trunc (srl x, c1)), c2) -> (trunc (srl x, (add c1, c2)))
    162 define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
    163 ; SSE-LABEL: combine_vec_lshr_trunc_lshr0:
    164 ; SSE:       # %bb.0:
    165 ; SSE-NEXT:    psrlq $48, %xmm1
    166 ; SSE-NEXT:    psrlq $48, %xmm0
    167 ; SSE-NEXT:    packusdw %xmm1, %xmm0
    168 ; SSE-NEXT:    retq
    169 ;
    170 ; AVX-LABEL: combine_vec_lshr_trunc_lshr0:
    171 ; AVX:       # %bb.0:
    172 ; AVX-NEXT:    vpsrlq $48, %ymm0, %ymm0
    173 ; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
    174 ; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
    175 ; AVX-NEXT:    vzeroupper
    176 ; AVX-NEXT:    retq
        ; The i32 shift by 16 is pulled through the truncate and merged with the
        ; i64 shift: 32 + 16 = 48, so codegen does one psrlq $48 per half and
        ; then narrows with packusdw.
    177   %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
    178   %2 = trunc <4 x i64> %1 to <4 x i32>
    179   %3 = lshr <4 x i32> %2, <i32 16, i32 16, i32 16, i32 16>
    180   ret <4 x i32> %3
    181 }
    182 
    183 define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
    184 ; SSE-LABEL: combine_vec_lshr_trunc_lshr1:
    185 ; SSE:       # %bb.0:
    186 ; SSE-NEXT:    movdqa %xmm1, %xmm2
    187 ; SSE-NEXT:    psrlq $35, %xmm2
    188 ; SSE-NEXT:    psrlq $34, %xmm1
    189 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
    190 ; SSE-NEXT:    movdqa %xmm0, %xmm2
    191 ; SSE-NEXT:    psrlq $33, %xmm2
    192 ; SSE-NEXT:    psrlq $32, %xmm0
    193 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
    194 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
    195 ; SSE-NEXT:    movaps %xmm0, %xmm1
    196 ; SSE-NEXT:    psrld $19, %xmm1
    197 ; SSE-NEXT:    movaps %xmm0, %xmm2
    198 ; SSE-NEXT:    psrld $17, %xmm2
    199 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
    200 ; SSE-NEXT:    movaps %xmm0, %xmm1
    201 ; SSE-NEXT:    psrld $18, %xmm1
    202 ; SSE-NEXT:    psrld $16, %xmm0
    203 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
    204 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    205 ; SSE-NEXT:    retq
    206 ;
    207 ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
    208 ; AVX2-SLOW:       # %bb.0:
    209 ; AVX2-SLOW-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
    210 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
    211 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    212 ; AVX2-SLOW-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
    213 ; AVX2-SLOW-NEXT:    vzeroupper
    214 ; AVX2-SLOW-NEXT:    retq
    215 ;
    216 ; AVX2-FAST-LABEL: combine_vec_lshr_trunc_lshr1:
    217 ; AVX2-FAST:       # %bb.0:
    218 ; AVX2-FAST-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
    219 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
    220 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
    221 ; AVX2-FAST-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
    222 ; AVX2-FAST-NEXT:    vzeroupper
    223 ; AVX2-FAST-NEXT:    retq
        ; NOTE(review): with non-uniform amounts the srl-through-trunc merge does
        ; not fire, so both shift stages (64-bit then 32-bit) survive.  This is
        ; also the only function where the AVX2 slow/fast variable-shuffle
        ; prefixes diverge: vpshufd+vpermq versus a single vpermd truncation.
    224   %1 = lshr <4 x i64> %x, <i64 32, i64 33, i64 34, i64 35>
    225   %2 = trunc <4 x i64> %1 to <4 x i32>
    226   %3 = lshr <4 x i32> %2, <i32 16, i32 17, i32 18, i32 19>
    227   ret <4 x i32> %3
    228 }
    229 
    230 ; fold (srl (trunc (srl x, c1)), c2) -> 0
    231 define <4 x i32> @combine_vec_lshr_trunc_lshr_zero0(<4 x i64> %x) {
    232 ; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero0:
    233 ; SSE:       # %bb.0:
    234 ; SSE-NEXT:    xorps %xmm0, %xmm0
    235 ; SSE-NEXT:    retq
    236 ;
    237 ; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero0:
    238 ; AVX:       # %bb.0:
    239 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
    240 ; AVX-NEXT:    retq
        ; After lshr by 48 at most 16 bits survive the truncate; the following
        ; shift by 24 clears them all, so the whole chain folds to zero.
    241   %1 = lshr <4 x i64> %x, <i64 48, i64 48, i64 48, i64 48>
    242   %2 = trunc <4 x i64> %1 to <4 x i32>
    243   %3 = lshr <4 x i32> %2, <i32 24, i32 24, i32 24, i32 24>
    244   ret <4 x i32> %3
    245 }
    246 
    247 define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
    248 ; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero1:
    249 ; SSE:       # %bb.0:
    250 ; SSE-NEXT:    movdqa %xmm1, %xmm2
    251 ; SSE-NEXT:    psrlq $51, %xmm2
    252 ; SSE-NEXT:    psrlq $50, %xmm1
    253 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
    254 ; SSE-NEXT:    movdqa %xmm0, %xmm2
    255 ; SSE-NEXT:    psrlq $49, %xmm2
    256 ; SSE-NEXT:    psrlq $48, %xmm0
    257 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
    258 ; SSE-NEXT:    packusdw %xmm1, %xmm0
    259 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    260 ; SSE-NEXT:    psrld $27, %xmm1
    261 ; SSE-NEXT:    movdqa %xmm0, %xmm2
    262 ; SSE-NEXT:    psrld $25, %xmm2
    263 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
    264 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    265 ; SSE-NEXT:    psrld $26, %xmm1
    266 ; SSE-NEXT:    psrld $24, %xmm0
    267 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
    268 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    269 ; SSE-NEXT:    retq
    270 ;
    271 ; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero1:
    272 ; AVX:       # %bb.0:
    273 ; AVX-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
    274 ; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
    275 ; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
    276 ; AVX-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
    277 ; AVX-NEXT:    vzeroupper
    278 ; AVX-NEXT:    retq
        ; NOTE(review): as in combine_vec_lshr_trunc_lshr_zero0 the result is
        ; provably zero (<= 16 live bits after the i64 shift, then shifted out
        ; by >= 24), but the non-uniform amounts defeat the fold and the checks
        ; pin the full two-stage lowering instead of an xor.
    279   %1 = lshr <4 x i64> %x, <i64 48, i64 49, i64 50, i64 51>
    280   %2 = trunc <4 x i64> %1 to <4 x i32>
    281   %3 = lshr <4 x i32> %2, <i32 24, i32 25, i32 26, i32 27>
    282   ret <4 x i32> %3
    283 }
    284 
    285 ; fold (srl (shl x, c), c) -> (and x, cst2)
    286 define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) {
    287 ; SSE-LABEL: combine_vec_lshr_shl_mask0:
    288 ; SSE:       # %bb.0:
    289 ; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
    290 ; SSE-NEXT:    retq
    291 ;
    292 ; AVX-LABEL: combine_vec_lshr_shl_mask0:
    293 ; AVX:       # %bb.0:
    294 ; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823]
    295 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
    296 ; AVX-NEXT:    retq
        ; shl then lshr by the same amount (2) just clears the top 2 bits, so it
        ; becomes a mask with 0x3FFFFFFF (= 1073741823).
    297   %1 =  shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
    298   %2 = lshr <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
    299   ret <4 x i32> %2
    300 }
    301 
    302 define <4 x i32> @combine_vec_lshr_shl_mask1(<4 x i32> %x) {
    303 ; SSE-LABEL: combine_vec_lshr_shl_mask1:
    304 ; SSE:       # %bb.0:
    305 ; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
    306 ; SSE-NEXT:    retq
    307 ;
    308 ; AVX-LABEL: combine_vec_lshr_shl_mask1:
    309 ; AVX:       # %bb.0:
    310 ; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
    311 ; AVX-NEXT:    retq
        ; Same shl+lshr-by-equal-amounts fold, but with per-lane amounts
        ; (2,3,4,5): still a single and, now with a per-lane constant-pool mask.
    312   %1 =  shl <4 x i32> %x, <i32 2, i32 3, i32 4, i32 5>
    313   %2 = lshr <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
    314   ret <4 x i32> %2
    315 }
    316 
    317 ; fold (srl (sra X, Y), 31) -> (srl X, 31)
    318 define <4 x i32> @combine_vec_lshr_ashr_sign(<4 x i32> %x, <4 x i32> %y) {
    319 ; SSE-LABEL: combine_vec_lshr_ashr_sign:
    320 ; SSE:       # %bb.0:
    321 ; SSE-NEXT:    psrld $31, %xmm0
    322 ; SSE-NEXT:    retq
    323 ;
    324 ; AVX-LABEL: combine_vec_lshr_ashr_sign:
    325 ; AVX:       # %bb.0:
    326 ; AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
    327 ; AVX-NEXT:    retq
        ; lshr by 31 extracts only the sign bit, which an arithmetic shift never
        ; changes, so the intermediate ashr (and %y entirely) is dropped.
    328   %1 = ashr <4 x i32> %x, %y
    329   %2 = lshr <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
    330   ret <4 x i32> %2
    331 }
    332 
    333 ; fold (srl (ctlz x), "5") -> x  iff x has one bit set (the low bit).
    334 define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) {
    335 ; SSE-LABEL: combine_vec_lshr_lzcnt_bit0:
    336 ; SSE:       # %bb.0:
    337 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
    338 ; SSE-NEXT:    psrld $4, %xmm0
    339 ; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
    340 ; SSE-NEXT:    retq
    341 ;
    342 ; AVX-LABEL: combine_vec_lshr_lzcnt_bit0:
    343 ; AVX:       # %bb.0:
    344 ; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16]
    345 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
    346 ; AVX-NEXT:    vpsrld $4, %xmm0, %xmm0
    347 ; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
    348 ; AVX-NEXT:    vpsrld $4, ... — see note
        ; NOTE: after (and x, 16) each lane is 0 or 16; ctlz gives 32 or 27, and
        ; >>5 gives 1 or 0 respectively.  That is exactly ((x & 16) >> 4) ^ 1,
        ; which is the and/shift/xor sequence the checks expect — no ctlz
        ; instruction should survive.
    350   %1 = and <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
    351   %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0)
    352   %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5>
    353   ret <4 x i32> %3
    354 }
    355 
    356 define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
    357 ; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
    358 ; SSE:       # %bb.0:
    359 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
    360 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    361 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    362 ; SSE-NEXT:    pand %xmm2, %xmm1
    363 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
    364 ; SSE-NEXT:    movdqa %xmm3, %xmm4
    365 ; SSE-NEXT:    pshufb %xmm1, %xmm4
    366 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    367 ; SSE-NEXT:    psrlw $4, %xmm1
    368 ; SSE-NEXT:    pand %xmm2, %xmm1
    369 ; SSE-NEXT:    pxor %xmm2, %xmm2
    370 ; SSE-NEXT:    pshufb %xmm1, %xmm3
    371 ; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
    372 ; SSE-NEXT:    pand %xmm4, %xmm1
    373 ; SSE-NEXT:    paddb %xmm3, %xmm1
    374 ; SSE-NEXT:    movdqa %xmm0, %xmm3
    375 ; SSE-NEXT:    pcmpeqb %xmm2, %xmm3
    376 ; SSE-NEXT:    psrlw $8, %xmm3
    377 ; SSE-NEXT:    pand %xmm1, %xmm3
    378 ; SSE-NEXT:    psrlw $8, %xmm1
    379 ; SSE-NEXT:    paddw %xmm3, %xmm1
    380 ; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
    381 ; SSE-NEXT:    psrld $16, %xmm0
    382 ; SSE-NEXT:    pand %xmm1, %xmm0
    383 ; SSE-NEXT:    psrld $16, %xmm1
    384 ; SSE-NEXT:    paddd %xmm0, %xmm1
    385 ; SSE-NEXT:    psrld $5, %xmm1
    386 ; SSE-NEXT:    movdqa %xmm1, %xmm0
    387 ; SSE-NEXT:    retq
    388 ;
    389 ; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
    390 ; AVX:       # %bb.0:
    391 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    392 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    393 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
    394 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
    395 ; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
    396 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
    397 ; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
    398 ; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
    399 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
    400 ; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
    401 ; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
    402 ; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
    403 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
    404 ; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
    405 ; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm2
    406 ; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
    407 ; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
    408 ; AVX-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
    409 ; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
    410 ; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
    411 ; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
    412 ; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
    413 ; AVX-NEXT:    vpsrld $5, %xmm0, %xmm0
    414 ; AVX-NEXT:    retq
        ; NOTE(review): the mask (4,32,64,128) allows a different single bit per
        ; lane, so the "(ctlz x) >> 5" fold (which needs the low bit) does not
        ; apply; neither target has vector lzcnt here, so the checks pin the
        ; full pshufb-based nibble-LUT ctlz expansion followed by psrld $5.
    415   %1 = and <4 x i32> %x, <i32 4, i32 32, i32 64, i32 128>
    416   %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0)
    417   %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5>
    418   ret <4 x i32> %3
    419 }
    420 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
    421 
    422 ; fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
    423 define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
    424 ; SSE-LABEL: combine_vec_lshr_trunc_and:
    425 ; SSE:       # %bb.0:
    426 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
    427 ; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
    428 ; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
    429 ; SSE-NEXT:    movdqa %xmm0, %xmm3
    430 ; SSE-NEXT:    psrld %xmm2, %xmm3
    431 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
    432 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
    433 ; SSE-NEXT:    movdqa %xmm0, %xmm5
    434 ; SSE-NEXT:    psrld %xmm4, %xmm5
    435 ; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
    436 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
    437 ; SSE-NEXT:    movdqa %xmm0, %xmm3
    438 ; SSE-NEXT:    psrld %xmm1, %xmm3
    439 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
    440 ; SSE-NEXT:    psrld %xmm1, %xmm0
    441 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
    442 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
    443 ; SSE-NEXT:    retq
    444 ;
    445 ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and:
    446 ; AVX2-SLOW:       # %bb.0:
    447 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
    448 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
    449 ; AVX2-SLOW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
    450 ; AVX2-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
    451 ; AVX2-SLOW-NEXT:    vzeroupper
    452 ; AVX2-SLOW-NEXT:    retq
    453 ;
    454 ; AVX2-FAST-LABEL: combine_vec_lshr_trunc_and:
    455 ; AVX2-FAST:       # %bb.0:
    456 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
    457 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
    458 ; AVX2-FAST-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
    459 ; AVX2-FAST-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
    460 ; AVX2-FAST-NEXT:    vzeroupper
    461 ; AVX2-FAST-NEXT:    retq
        ; The and is narrowed through the trunc: both targets truncate %y to
        ; 4 x i32 first (shuffle/permute), mask in 128 bits, and then do the
        ; variable shift — all in xmm rather than ymm width where possible.
    462   %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
    463   %2 = trunc <4 x i64> %1 to <4 x i32>
    464   %3 = lshr <4 x i32> %x, %2
    465   ret <4 x i32> %3
    466 }
    467