; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2

; fold (urem x, 1) -> 0
define i32 @combine_urem_by_one(i32 %x) {
; CHECK-LABEL: combine_urem_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, 1
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_by_one(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_one:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_by_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (urem x, -1) -> select((icmp eq x, -1), 0, x)
define i32 @combine_urem_by_negone(i32 %x) {
; CHECK-LABEL: combine_urem_by_negone:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl $-1, %edi
; CHECK-NEXT:    cmovnel %edi, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, -1
  ret i32 %1
}

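; the vector form computes select((icmp eq x, -1), 0, x) as x & ~(x == -1),
; i.e. a pcmpeqd equality mask followed by pandn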
define <4 x i32> @combine_vec_urem_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE-NEXT:    pandn %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_by_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (urem x, INT_MIN) -> (and x, ~INT_MIN)
define i32 @combine_urem_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_urem_by_minsigned:
; CHECK:       # %bb.0:
; CHECK-NEXT:    andl $2147483647, %edi # imm = 0x7FFFFFFF
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, -2147483648
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_minsigned:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_minsigned:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_minsigned:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ret <4 x i32> %1
}

; TODO: fold (urem x, x) -> 0
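; until then, the urem is expanded to a real unsigned division and the
; remainder is read back from %edx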
define i32 @combine_urem_dupe(i32 %x) {
; CHECK-LABEL: combine_urem_dupe:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %edx, %edx
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    divl %edi
; CHECK-NEXT:    movl %edx, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, %x
  ret i32 %1
}

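; the vector variant is fully scalarized: each lane is extracted, divided
; with divl, and reinserted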
define <4 x i32> @combine_vec_urem_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_dupe:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrd $1, %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %eax
; SSE-NEXT:    movl %edx, %ecx
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %eax
; SSE-NEXT:    movd %edx, %xmm1
; SSE-NEXT:    pinsrd $1, %ecx, %xmm1
; SSE-NEXT:    pextrd $2, %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %eax
; SSE-NEXT:    pinsrd $2, %edx, %xmm1
; SSE-NEXT:    pextrd $3, %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %eax
; SSE-NEXT:    pinsrd $3, %edx, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_dupe:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrd $1, %xmm0, %eax
; AVX-NEXT:    xorl %edx, %edx
; AVX-NEXT:    divl %eax
; AVX-NEXT:    movl %edx, %ecx
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    xorl %edx, %edx
; AVX-NEXT:    divl %eax
; AVX-NEXT:    vmovd %edx, %xmm1
; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
; AVX-NEXT:    vpextrd $2, %xmm0, %eax
; AVX-NEXT:    xorl %edx, %edx
; AVX-NEXT:    divl %eax
; AVX-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
; AVX-NEXT:    vpextrd $3, %xmm0, %eax
; AVX-NEXT:    xorl %edx, %edx
; AVX-NEXT:    divl %eax
; AVX-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, %x
  ret <4 x i32> %1
}

; fold (urem x, pow2) -> (and x, (pow2-1))
define <4 x i32> @combine_vec_urem_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [3,3,3,3]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %1
}

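; non-uniform power-of-2 divisors also fold to a single mask load; each lane
; of <1,4,8,16> is masked with its own (pow2-1) value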
define <4 x i32> @combine_vec_urem_by_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_by_pow2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %1
}

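; fold (urem x, (shl 1, y)) -> (and x, (add (shl 1, y), -1))
; SSE has no per-lane variable shift, so (1 << y) is built with the float
; exponent trick: pslld $23 moves y into the exponent field, paddd adds the
; bias for 1.0, and cvttps2dq converts the resulting 2^y values to integers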
define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_pow2c:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $23, %xmm1
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2c:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2c:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX2-NEXT:    vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

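; fold (urem x, (lshr INT_MIN, y)) -> (and x, (add (lshr INT_MIN, y), -1))
; pre-AVX2 targets have no variable psrld, so the shift is performed four
; times with scalar amounts and the lanes are blended back together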
define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_pow2d:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld %xmm2, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm6
; SSE-NEXT:    psrld %xmm5, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld %xmm1, %xmm4
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrld %xmm1, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2d:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT:    vpsrld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2d:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = lshr <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

; fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_shl_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $23, %xmm1
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE-NEXT:    pslld $2, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_shl_pow2a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpslld $2, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_shl_pow2a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
; AVX2-NEXT:    vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

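; as above but with non-uniform bases: <1,4,8,16> << y is rebuilt (AVX2 uses
; vpsllvd directly; SSE/AVX1 scale the 2^y values with pmulld), decremented,
; and used as the mask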
define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_shl_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $23, %xmm1
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_shl_pow2b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_shl_pow2b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,4,8,16]
; AVX2-NEXT:    vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}