Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST
      5 
      6 ; fold (sra 0, x) -> 0
      7 define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) {
; 0 >> x is still 0 in every lane, so both SSE and AVX collapse the whole
; function to a single register-zeroing xor.
      8 ; SSE-LABEL: combine_vec_ashr_zero:
      9 ; SSE:       # %bb.0:
     10 ; SSE-NEXT:    xorps %xmm0, %xmm0
     11 ; SSE-NEXT:    retq
     12 ;
     13 ; AVX-LABEL: combine_vec_ashr_zero:
     14 ; AVX:       # %bb.0:
     15 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
     16 ; AVX-NEXT:    retq
     17   %1 = ashr <4 x i32> zeroinitializer, %x
     18   ret <4 x i32> %1
     19 }
     20 
     21 ; fold (sra -1, x) -> -1
     22 define <4 x i32> @combine_vec_ashr_allones(<4 x i32> %x) {
; An all-ones lane (sign bit set) is unchanged by any arithmetic shift, so the
; constant is simply rematerialized with a single pcmpeqd/vpcmpeqd.
     23 ; SSE-LABEL: combine_vec_ashr_allones:
     24 ; SSE:       # %bb.0:
     25 ; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
     26 ; SSE-NEXT:    retq
     27 ;
     28 ; AVX-LABEL: combine_vec_ashr_allones:
     29 ; AVX:       # %bb.0:
     30 ; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
     31 ; AVX-NEXT:    retq
     32   %1 = ashr <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %x
     33   ret <4 x i32> %1
     34 }
     35 
     36 ; fold (sra x, c >= size(x)) -> undef
     37 define <4 x i32> @combine_vec_ashr_outofrange0(<4 x i32> %x) {
; A uniform shift amount of 33 exceeds the 32-bit element width, so the shift
; folds away completely and only retq is expected.
     38 ; CHECK-LABEL: combine_vec_ashr_outofrange0:
     39 ; CHECK:       # %bb.0:
     40 ; CHECK-NEXT:    retq
     41   %1 = ashr <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
     42   ret <4 x i32> %1
     43 }
     44 
     45 define <4 x i32> @combine_vec_ashr_outofrange1(<4 x i32> %x) {
; Every per-element amount (33..36) exceeds the 32-bit element width, so the
; non-uniform shift also folds away entirely (retq only).
     46 ; CHECK-LABEL: combine_vec_ashr_outofrange1:
     47 ; CHECK:       # %bb.0:
     48 ; CHECK-NEXT:    retq
     49   %1 = ashr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
     50   ret <4 x i32> %1
     51 }
     52 
     53 ; fold (sra x, 0) -> x
     54 define <4 x i32> @combine_vec_ashr_by_zero(<4 x i32> %x) {
; Shifting by zero is the identity, so the input register is returned untouched.
     55 ; CHECK-LABEL: combine_vec_ashr_by_zero:
     56 ; CHECK:       # %bb.0:
     57 ; CHECK-NEXT:    retq
     58   %1 = ashr <4 x i32> %x, zeroinitializer
     59   ret <4 x i32> %1
     60 }
     61 
     62 ; fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
     63 define <4 x i32> @combine_vec_ashr_ashr0(<4 x i32> %x) {
; The two uniform in-range shifts (by 2, then by 4) merge into one shift by 6.
     64 ; SSE-LABEL: combine_vec_ashr_ashr0:
     65 ; SSE:       # %bb.0:
     66 ; SSE-NEXT:    psrad $6, %xmm0
     67 ; SSE-NEXT:    retq
     68 ;
     69 ; AVX-LABEL: combine_vec_ashr_ashr0:
     70 ; AVX:       # %bb.0:
     71 ; AVX-NEXT:    vpsrad $6, %xmm0, %xmm0
     72 ; AVX-NEXT:    retq
     73   %1 = ashr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
     74   %2 = ashr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
     75   ret <4 x i32> %2
     76 }
     77 
     78 define <4 x i32> @combine_vec_ashr_ashr1(<4 x i32> %x) {
; The per-lane amounts <0,1,2,3> and <4,5,6,7> merge into one variable shift:
; a single vpsravd on AVX. SSE4.1 has no per-lane variable shift, so the merged
; shift is expanded as four immediate psrad ops stitched together with blends.
     79 ; SSE-LABEL: combine_vec_ashr_ashr1:
     80 ; SSE:       # %bb.0:
     81 ; SSE-NEXT:    movdqa %xmm0, %xmm1
     82 ; SSE-NEXT:    psrad $10, %xmm1
     83 ; SSE-NEXT:    movdqa %xmm0, %xmm2
     84 ; SSE-NEXT:    psrad $6, %xmm2
     85 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
     86 ; SSE-NEXT:    movdqa %xmm0, %xmm1
     87 ; SSE-NEXT:    psrad $8, %xmm1
     88 ; SSE-NEXT:    psrad $4, %xmm0
     89 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
     90 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
     91 ; SSE-NEXT:    retq
     92 ;
     93 ; AVX-LABEL: combine_vec_ashr_ashr1:
     94 ; AVX:       # %bb.0:
     95 ; AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
     96 ; AVX-NEXT:    retq
     97   %1 = ashr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
     98   %2 = ashr <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
     99   ret <4 x i32> %2
    100 }
    101 
    102 define <4 x i32> @combine_vec_ashr_ashr2(<4 x i32> %x) {
; Every summed per-lane amount (17+25 .. 20+28) exceeds 31, so only the sign
; bit survives in each lane: the combine reduces this to a single psrad $31.
    103 ; SSE-LABEL: combine_vec_ashr_ashr2:
    104 ; SSE:       # %bb.0:
    105 ; SSE-NEXT:    psrad $31, %xmm0
    106 ; SSE-NEXT:    retq
    107 ;
    108 ; AVX-LABEL: combine_vec_ashr_ashr2:
    109 ; AVX:       # %bb.0:
    110 ; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
    111 ; AVX-NEXT:    retq
    112   %1 = ashr <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
    113   %2 = ashr <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
    114   ret <4 x i32> %2
    115 }
    116 
    117 define <4 x i32> @combine_vec_ashr_ashr3(<4 x i32> %x) {
; Some lanes carry out-of-range amounts (50 and 33), so the two shift stages
; are NOT merged into one: AVX still performs two vpsravd ops, and SSE expands
; each stage separately with immediate psrad plus blends.
    118 ; SSE-LABEL: combine_vec_ashr_ashr3:
    119 ; SSE:       # %bb.0:
    120 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    121 ; SSE-NEXT:    psrad $27, %xmm1
    122 ; SSE-NEXT:    movdqa %xmm0, %xmm2
    123 ; SSE-NEXT:    psrad $5, %xmm2
    124 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
    125 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    126 ; SSE-NEXT:    psrad $31, %xmm1
    127 ; SSE-NEXT:    psrad $1, %xmm0
    128 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
    129 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    130 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    131 ; SSE-NEXT:    psrad $10, %xmm1
    132 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
    133 ; SSE-NEXT:    psrad $31, %xmm0
    134 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
    135 ; SSE-NEXT:    retq
    136 ;
    137 ; AVX-LABEL: combine_vec_ashr_ashr3:
    138 ; AVX:       # %bb.0:
    139 ; AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
    140 ; AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
    141 ; AVX-NEXT:    retq
    142   %1 = ashr <4 x i32> %x, <i32  1, i32  5, i32 50, i32 27>
    143   %2 = ashr <4 x i32> %1, <i32 33, i32 10, i32 33, i32  0>
    144   ret <4 x i32> %2
    145 }
    146 
    147 ; fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
    148 define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; The trunc is pushed through the 'and': the 64-bit amounts are narrowed to
; 32 bits first (shuffle/permute of the low dwords), masked, and then a single
; 128-bit variable shift is used — vpsravd on AVX2, or an SSE expansion of
; per-element psrad driven by the extracted amounts.
    149 ; SSE-LABEL: combine_vec_ashr_trunc_and:
    150 ; SSE:       # %bb.0:
    151 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
    152 ; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
    153 ; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
    154 ; SSE-NEXT:    movdqa %xmm0, %xmm3
    155 ; SSE-NEXT:    psrad %xmm2, %xmm3
    156 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
    157 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
    158 ; SSE-NEXT:    movdqa %xmm0, %xmm5
    159 ; SSE-NEXT:    psrad %xmm4, %xmm5
    160 ; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
    161 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
    162 ; SSE-NEXT:    movdqa %xmm0, %xmm3
    163 ; SSE-NEXT:    psrad %xmm1, %xmm3
    164 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
    165 ; SSE-NEXT:    psrad %xmm1, %xmm0
    166 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
    167 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
    168 ; SSE-NEXT:    retq
    169 ;
    170 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and:
    171 ; AVX2-SLOW:       # %bb.0:
    172 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
    173 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
    174 ; AVX2-SLOW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
    175 ; AVX2-SLOW-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
    176 ; AVX2-SLOW-NEXT:    vzeroupper
    177 ; AVX2-SLOW-NEXT:    retq
    178 ;
    179 ; AVX2-FAST-LABEL: combine_vec_ashr_trunc_and:
    180 ; AVX2-FAST:       # %bb.0:
    181 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
    182 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
    183 ; AVX2-FAST-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
    184 ; AVX2-FAST-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
    185 ; AVX2-FAST-NEXT:    vzeroupper
    186 ; AVX2-FAST-NEXT:    retq
    187   %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
    188   %2 = trunc <4 x i64> %1 to <4 x i32>
    189   %3 = ashr <4 x i32> %x, %2
    190   ret <4 x i32> %3
    191 }
    192 
    193 ; fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
    194 ;      if c1 is equal to the number of bits the trunc removes
    195 define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
; After lshr by 32, only the high dword of each 64-bit lane survives the trunc.
; SSE grabs those halves directly with shufps [1,3] and applies the remaining
; <0,1,2,3> ashr as 32-bit psrad/blends; AVX2 keeps the vpsrlq $32, narrows
; via shuffle/permute, and finishes with a single vpsravd.
    196 ; SSE-LABEL: combine_vec_ashr_trunc_lshr:
    197 ; SSE:       # %bb.0:
    198 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
    199 ; SSE-NEXT:    movaps %xmm0, %xmm2
    200 ; SSE-NEXT:    movaps %xmm0, %xmm1
    201 ; SSE-NEXT:    psrad $2, %xmm1
    202 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
    203 ; SSE-NEXT:    psrad $3, %xmm0
    204 ; SSE-NEXT:    psrad $1, %xmm2
    205 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
    206 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
    207 ; SSE-NEXT:    movdqa %xmm1, %xmm0
    208 ; SSE-NEXT:    retq
    209 ;
    210 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
    211 ; AVX2-SLOW:       # %bb.0:
    212 ; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm0, %ymm0
    213 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
    214 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    215 ; AVX2-SLOW-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
    216 ; AVX2-SLOW-NEXT:    vzeroupper
    217 ; AVX2-SLOW-NEXT:    retq
    218 ;
    219 ; AVX2-FAST-LABEL: combine_vec_ashr_trunc_lshr:
    220 ; AVX2-FAST:       # %bb.0:
    221 ; AVX2-FAST-NEXT:    vpsrlq $32, %ymm0, %ymm0
    222 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
    223 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
    224 ; AVX2-FAST-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
    225 ; AVX2-FAST-NEXT:    vzeroupper
    226 ; AVX2-FAST-NEXT:    retq
    227   %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
    228   %2 = trunc <4 x i64> %1 to <4 x i32>
    229   %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
    230   ret <4 x i32> %3
    231 }
    232 
    233 ; fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
    234 ;      if c1 is equal to the number of bits the trunc removes
    235 define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
; ashr by 32 followed by trunc also just selects the high dword of each 64-bit
; lane, so no 64-bit shift is emitted at all: both SSE and AVX2 lower it as a
; shuffle of the odd elements ([1,3]) followed by the remaining 32-bit
; <0,1,2,3> ashr (psrad/blends on SSE, one vpsravd on AVX2).
    236 ; SSE-LABEL: combine_vec_ashr_trunc_ashr:
    237 ; SSE:       # %bb.0:
    238 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
    239 ; SSE-NEXT:    movaps %xmm0, %xmm2
    240 ; SSE-NEXT:    movaps %xmm0, %xmm1
    241 ; SSE-NEXT:    psrad $2, %xmm1
    242 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
    243 ; SSE-NEXT:    psrad $3, %xmm0
    244 ; SSE-NEXT:    psrad $1, %xmm2
    245 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
    246 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
    247 ; SSE-NEXT:    movdqa %xmm1, %xmm0
    248 ; SSE-NEXT:    retq
    249 ;
    250 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
    251 ; AVX2-SLOW:       # %bb.0:
    252 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
    253 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    254 ; AVX2-SLOW-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
    255 ; AVX2-SLOW-NEXT:    vzeroupper
    256 ; AVX2-SLOW-NEXT:    retq
    257 ;
    258 ; AVX2-FAST-LABEL: combine_vec_ashr_trunc_ashr:
    259 ; AVX2-FAST:       # %bb.0:
    260 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,5,7,5,7,6,7]
    261 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
    262 ; AVX2-FAST-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
    263 ; AVX2-FAST-NEXT:    vzeroupper
    264 ; AVX2-FAST-NEXT:    retq
    265   %1 = ashr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
    266   %2 = trunc <4 x i64> %1 to <4 x i32>
    267   %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
    268   ret <4 x i32> %3
    269 }
    270 
    271 ; If the sign bit is known to be zero, switch this to a SRL.
    272 define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) {
; The mask clears every sign bit, so the arithmetic shift becomes a logical
; one: AVX emits vpsrlvd (not vpsravd), and the SSE expansion uses per-lane
; psrld instead of psrad.
    273 ; SSE-LABEL: combine_vec_ashr_positive:
    274 ; SSE:       # %bb.0:
    275 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
    276 ; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
    277 ; SSE-NEXT:    movdqa %xmm0, %xmm3
    278 ; SSE-NEXT:    psrld %xmm2, %xmm3
    279 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
    280 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
    281 ; SSE-NEXT:    movdqa %xmm0, %xmm5
    282 ; SSE-NEXT:    psrld %xmm4, %xmm5
    283 ; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
    284 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
    285 ; SSE-NEXT:    movdqa %xmm0, %xmm3
    286 ; SSE-NEXT:    psrld %xmm1, %xmm3
    287 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
    288 ; SSE-NEXT:    psrld %xmm1, %xmm0
    289 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
    290 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
    291 ; SSE-NEXT:    retq
    292 ;
    293 ; AVX-LABEL: combine_vec_ashr_positive:
    294 ; AVX:       # %bb.0:
    295 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    296 ; AVX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
    297 ; AVX-NEXT:    retq
    298   %1 = and <4 x i32> %x, <i32 15, i32 255, i32 4095, i32 65535>
    299   %2 = ashr <4 x i32> %1, %y
    300   ret <4 x i32> %2
    301 }
    302 
    303 define <4 x i32> @combine_vec_ashr_positive_splat(<4 x i32> %x, <4 x i32> %y) {
; x & 1023 keeps at most the 10 low bits, and shifting right by 10 discards
; all of them, so the whole function folds to a zero vector (single xor).
    304 ; SSE-LABEL: combine_vec_ashr_positive_splat:
    305 ; SSE:       # %bb.0:
    306 ; SSE-NEXT:    xorps %xmm0, %xmm0
    307 ; SSE-NEXT:    retq
    308 ;
    309 ; AVX-LABEL: combine_vec_ashr_positive_splat:
    310 ; AVX:       # %bb.0:
    311 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
    312 ; AVX-NEXT:    retq
    313   %1 = and <4 x i32> %x, <i32 1023, i32 1023, i32 1023, i32 1023>
    314   %2 = ashr <4 x i32> %1, <i32 10, i32 10, i32 10, i32 10>
    315   ret <4 x i32> %2
    316 }
    317