Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
      4 
      5 ; fold (mul x, 0) -> 0
      6 define <4 x i32> @combine_vec_mul_zero(<4 x i32> %x) {
        ; Multiplies %x by an all-zero vector. Both check prefixes expect no multiply
        ; instruction at all: only an xor that clears xmm0, followed by the return.
      7 ; SSE-LABEL: combine_vec_mul_zero:
      8 ; SSE:       # %bb.0:
      9 ; SSE-NEXT:    xorps %xmm0, %xmm0
     10 ; SSE-NEXT:    retq
     11 ;
     12 ; AVX-LABEL: combine_vec_mul_zero:
     13 ; AVX:       # %bb.0:
     14 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
     15 ; AVX-NEXT:    retq
     16   %1 = mul <4 x i32> %x, zeroinitializer
     17   ret <4 x i32> %1
     18 }
     19 
     20 ; fold (mul x, 1) -> x
     21 define <4 x i32> @combine_vec_mul_one(<4 x i32> %x) {
        ; Multiplies %x by a splat of 1. Both check prefixes expect the function body
        ; to be nothing but the return, i.e. the argument is passed through unchanged.
     22 ; SSE-LABEL: combine_vec_mul_one:
     23 ; SSE:       # %bb.0:
     24 ; SSE-NEXT:    retq
     25 ;
     26 ; AVX-LABEL: combine_vec_mul_one:
     27 ; AVX:       # %bb.0:
     28 ; AVX-NEXT:    retq
     29   %1 = mul <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
     30   ret <4 x i32> %1
     31 }
     32 
     33 ; fold (mul x, -1) -> 0-x
     34 define <4 x i32> @combine_vec_mul_negone(<4 x i32> %x) {
        ; Multiplies %x by a splat of -1. The expected code zeroes a scratch register
        ; and subtracts %x from it (0 - x) instead of multiplying. The SSE4.1 output
        ; needs one extra movdqa to copy the result back into xmm0, because the
        ; two-operand psubd leaves it in xmm1.
     35 ; SSE-LABEL: combine_vec_mul_negone:
     36 ; SSE:       # %bb.0:
     37 ; SSE-NEXT:    pxor %xmm1, %xmm1
     38 ; SSE-NEXT:    psubd %xmm0, %xmm1
     39 ; SSE-NEXT:    movdqa %xmm1, %xmm0
     40 ; SSE-NEXT:    retq
     41 ;
     42 ; AVX-LABEL: combine_vec_mul_negone:
     43 ; AVX:       # %bb.0:
     44 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
     45 ; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
     46 ; AVX-NEXT:    retq
     47   %1 = mul <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
     48   ret <4 x i32> %1
     49 }
     50 
     51 ; fold (mul x, (1 << c)) -> x << c
     52 define <4 x i32> @combine_vec_mul_pow2a(<4 x i32> %x) {
        ; Uniform (splat) multiplier of 2. The expected output is a single (v)paddd
        ; of xmm0 with itself, i.e. x + x, rather than a multiply or an explicit
        ; shift instruction.
     53 ; SSE-LABEL: combine_vec_mul_pow2a:
     54 ; SSE:       # %bb.0:
     55 ; SSE-NEXT:    paddd %xmm0, %xmm0
     56 ; SSE-NEXT:    retq
     57 ;
     58 ; AVX-LABEL: combine_vec_mul_pow2a:
     59 ; AVX:       # %bb.0:
     60 ; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
     61 ; AVX-NEXT:    retq
     62   %1 = mul <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
     63   ret <4 x i32> %1
     64 }
     65 
     66 define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
        ; Non-uniform power-of-two multipliers <1, 2, 4, 16>. The two run lines are
        ; expected to differ: the SSE4.1 checks keep a pmulld against a RIP-relative
        ; memory operand, while the AVX2 checks expect a per-element variable shift
        ; (vpsllvd) with a RIP-relative memory operand.
     67 ; SSE-LABEL: combine_vec_mul_pow2b:
     68 ; SSE:       # %bb.0:
     69 ; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
     70 ; SSE-NEXT:    retq
     71 ;
     72 ; AVX-LABEL: combine_vec_mul_pow2b:
     73 ; AVX:       # %bb.0:
     74 ; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
     75 ; AVX-NEXT:    retq
     76   %1 = mul <4 x i32> %x, <i32 1, i32 2, i32 4, i32 16>
     77   ret <4 x i32> %1
     78 }
     79 
     80 define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
        ; 64-bit element variant of the non-uniform power-of-two case, with
        ; multipliers <1, 2, 4, 16>.
        ; SSE4.1 checks: the 256-bit value arrives in xmm0/xmm1. Each half is shifted
        ; with immediate psllq (by 1 for the first half; by 2 and by 4 for the second)
        ; and pblendw picks, per 64-bit lane, the copy with the right shift amount;
        ; the lane multiplied by 1 is taken unshifted from xmm0.
        ; AVX2 checks: a single vpsllvq on ymm0 with a RIP-relative memory operand.
     81 ; SSE-LABEL: combine_vec_mul_pow2c:
     82 ; SSE:       # %bb.0:
     83 ; SSE-NEXT:    movdqa %xmm0, %xmm2
     84 ; SSE-NEXT:    psllq $1, %xmm2
     85 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
     86 ; SSE-NEXT:    movdqa %xmm1, %xmm0
     87 ; SSE-NEXT:    psllq $4, %xmm0
     88 ; SSE-NEXT:    psllq $2, %xmm1
     89 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
     90 ; SSE-NEXT:    movdqa %xmm2, %xmm0
     91 ; SSE-NEXT:    retq
     92 ;
     93 ; AVX-LABEL: combine_vec_mul_pow2c:
     94 ; AVX:       # %bb.0:
     95 ; AVX-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm0
     96 ; AVX-NEXT:    retq
     97   %1 = mul <4 x i64> %x, <i64 1, i64 2, i64 4, i64 16>
     98   ret <4 x i64> %1
     99 }
    100 
    101 ; fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
    102 define <4 x i32> @combine_vec_mul_negpow2a(<4 x i32> %x) {
        ; Uniform (splat) multiplier of -2. The expected code first doubles %x with
        ; (v)paddd xmm0, xmm0 and then negates the result by subtracting it from a
        ; zeroed register, i.e. 0 - (x + x). No multiply instruction is expected.
    103 ; SSE-LABEL: combine_vec_mul_negpow2a:
    104 ; SSE:       # %bb.0:
    105 ; SSE-NEXT:    paddd %xmm0, %xmm0
    106 ; SSE-NEXT:    pxor %xmm1, %xmm1
    107 ; SSE-NEXT:    psubd %xmm0, %xmm1
    108 ; SSE-NEXT:    movdqa %xmm1, %xmm0
    109 ; SSE-NEXT:    retq
    110 ;
    111 ; AVX-LABEL: combine_vec_mul_negpow2a:
    112 ; AVX:       # %bb.0:
    113 ; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
    114 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    115 ; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
    116 ; AVX-NEXT:    retq
    117   %1 = mul <4 x i32> %x, <i32 -2, i32 -2, i32 -2, i32 -2>
    118   ret <4 x i32> %1
    119 }
    120 
    121 define <4 x i32> @combine_vec_mul_negpow2b(<4 x i32> %x) {
        ; Non-uniform negated power-of-two multipliers <-1, -2, -4, -16>. Unlike the
        ; uniform -2 case, both check prefixes here expect the multiply to stay as a
        ; single (v)pmulld with a RIP-relative memory operand; no shift-and-negate
        ; sequence appears in the expected output.
    122 ; SSE-LABEL: combine_vec_mul_negpow2b:
    123 ; SSE:       # %bb.0:
    124 ; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
    125 ; SSE-NEXT:    retq
    126 ;
    127 ; AVX-LABEL: combine_vec_mul_negpow2b:
    128 ; AVX:       # %bb.0:
    129 ; AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
    130 ; AVX-NEXT:    retq
    131   %1 = mul <4 x i32> %x, <i32 -1, i32 -2, i32 -4, i32 -16>
    132   ret <4 x i32> %1
    133 }
    134 
    135 define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) {
        ; 64-bit element variant with non-uniform negated power-of-two multipliers
        ; <-1, -2, -4, -16>. Neither check prefix expects a shift-based form: the
        ; 64-bit multiply is expanded into 32x32->64 partial products ((v)pmuludq)
        ; that are combined with (v)psrlq / (v)psllq by 32 and (v)paddq.
        ; The large constants printed in the checks are the unsigned 64-bit forms of
        ; the multipliers (18446744073709551615 = -1, ...614 = -2, ...612 = -4,
        ; ...600 = -16); 4294967295 (0xFFFFFFFF) matches the upper 32 bits that all
        ; four of them share.
        ; The SSE4.1 output repeats the sequence once per 128-bit half (xmm0, xmm1);
        ; the AVX2 output does it once on ymm registers.
    136 ; SSE-LABEL: combine_vec_mul_negpow2c:
    137 ; SSE:       # %bb.0:
    138 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
    139 ; SSE-NEXT:    movdqa %xmm0, %xmm3
    140 ; SSE-NEXT:    pmuludq %xmm2, %xmm3
    141 ; SSE-NEXT:    movdqa %xmm0, %xmm4
    142 ; SSE-NEXT:    psrlq $32, %xmm4
    143 ; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [18446744073709551615,18446744073709551614]
    144 ; SSE-NEXT:    pmuludq %xmm5, %xmm4
    145 ; SSE-NEXT:    paddq %xmm3, %xmm4
    146 ; SSE-NEXT:    psllq $32, %xmm4
    147 ; SSE-NEXT:    pmuludq %xmm5, %xmm0
    148 ; SSE-NEXT:    paddq %xmm4, %xmm0
    149 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
    150 ; SSE-NEXT:    movdqa %xmm1, %xmm3
    151 ; SSE-NEXT:    psrlq $32, %xmm3
    152 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [18446744073709551612,18446744073709551600]
    153 ; SSE-NEXT:    pmuludq %xmm4, %xmm3
    154 ; SSE-NEXT:    paddq %xmm2, %xmm3
    155 ; SSE-NEXT:    psllq $32, %xmm3
    156 ; SSE-NEXT:    pmuludq %xmm4, %xmm1
    157 ; SSE-NEXT:    paddq %xmm3, %xmm1
    158 ; SSE-NEXT:    retq
    159 ;
    160 ; AVX-LABEL: combine_vec_mul_negpow2c:
    161 ; AVX:       # %bb.0:
    162 ; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
    163 ; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
    164 ; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm2
    165 ; AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600]
    166 ; AVX-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
    167 ; AVX-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
    168 ; AVX-NEXT:    vpsllq $32, %ymm1, %ymm1
    169 ; AVX-NEXT:    vpmuludq %ymm3, %ymm0, %ymm0
    170 ; AVX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
    171 ; AVX-NEXT:    retq
    172   %1 = mul <4 x i64> %x, <i64 -1, i64 -2, i64 -4, i64 -16>
    173   ret <4 x i64> %1
    174 }
    175 
    176 ; (mul (shl X, c1), c2) -> (mul X, c2 << c1)
    177 define <4 x i32> @combine_vec_mul_shl_const(<4 x i32> %x) {
        ; Shifts %x left by the constants <1, 2, 8, 16> and then multiplies by the
        ; constants <1, 3, 5, 7>. Both check prefixes expect the shift and the
        ; multiply to collapse into one (v)pmulld with a RIP-relative memory operand.
    178 ; SSE-LABEL: combine_vec_mul_shl_const:
    179 ; SSE:       # %bb.0:
    180 ; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
    181 ; SSE-NEXT:    retq
    182 ;
    183 ; AVX-LABEL: combine_vec_mul_shl_const:
    184 ; AVX:       # %bb.0:
    185 ; AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
    186 ; AVX-NEXT:    retq
    187   %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
    188   %2 = mul <4 x i32> %1, <i32 1, i32 3, i32 5, i32 7>
    189   ret <4 x i32> %2
    190 }
    191 
    192 ; (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one use.
    193 define <4 x i32> @combine_vec_mul_shl_oneuse0(<4 x i32> %x, <4 x i32> %y) {
        ; The constant shift of %x is used only by the multiply, and it is the
        ; multiply's first operand. The expected code multiplies %x by %y first
        ; ((v)pmulld of xmm1 into xmm0) and applies the constant scaling afterwards:
        ; a second pmulld against memory for SSE4.1, a vpsllvd for AVX2.
    194 ; SSE-LABEL: combine_vec_mul_shl_oneuse0:
    195 ; SSE:       # %bb.0:
    196 ; SSE-NEXT:    pmulld %xmm1, %xmm0
    197 ; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
    198 ; SSE-NEXT:    retq
    199 ;
    200 ; AVX-LABEL: combine_vec_mul_shl_oneuse0:
    201 ; AVX:       # %bb.0:
    202 ; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
    203 ; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
    204 ; AVX-NEXT:    retq
    205   %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
    206   %2 = mul <4 x i32> %1, %y
    207   ret <4 x i32> %2
    208 }
    209 
    210 define <4 x i32> @combine_vec_mul_shl_oneuse1(<4 x i32> %x, <4 x i32> %y) {
        ; Same as combine_vec_mul_shl_oneuse0 but with the multiply operands
        ; commuted (the shifted value is the second operand). The expected
        ; instruction sequences are identical to the non-commuted test.
    211 ; SSE-LABEL: combine_vec_mul_shl_oneuse1:
    212 ; SSE:       # %bb.0:
    213 ; SSE-NEXT:    pmulld %xmm1, %xmm0
    214 ; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
    215 ; SSE-NEXT:    retq
    216 ;
    217 ; AVX-LABEL: combine_vec_mul_shl_oneuse1:
    218 ; AVX:       # %bb.0:
    219 ; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
    220 ; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
    221 ; AVX-NEXT:    retq
    222   %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
    223   %2 = mul <4 x i32> %y, %1
    224   ret <4 x i32> %2
    225 }
    226 
    227 define <4 x i32> @combine_vec_mul_shl_multiuse0(<4 x i32> %x, <4 x i32> %y) {
        ; Here the shifted value has two uses (the multiply and the final add), so
        ; the one-use rewrite exercised by the oneuse tests must not fire. The
        ; expected code computes the shifted value once up front (pmulld against
        ; memory for SSE4.1, vpsllvd for AVX2), multiplies it by %y, and then adds
        ; the shifted value to the product.
    228 ; SSE-LABEL: combine_vec_mul_shl_multiuse0:
    229 ; SSE:       # %bb.0:
    230 ; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
    231 ; SSE-NEXT:    pmulld %xmm0, %xmm1
    232 ; SSE-NEXT:    paddd %xmm1, %xmm0
    233 ; SSE-NEXT:    retq
    234 ;
    235 ; AVX-LABEL: combine_vec_mul_shl_multiuse0:
    236 ; AVX:       # %bb.0:
    237 ; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
    238 ; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
    239 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
    240 ; AVX-NEXT:    retq
    241   %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
    242   %2 = mul <4 x i32> %1, %y
    243   %3 = add <4 x i32> %1, %2
    244   ret <4 x i32> %3
    245 }
    246 
    247 define <4 x i32> @combine_vec_mul_shl_multiuse1(<4 x i32> %x, <4 x i32> %y) {
        ; Same as combine_vec_mul_shl_multiuse0 but with the multiply operands
        ; commuted. The shifted value is still computed first and reused by both
        ; the multiply and the add; the only difference in the expected output is
        ; the source operand order of the AVX2 vpmulld.
    248 ; SSE-LABEL: combine_vec_mul_shl_multiuse1:
    249 ; SSE:       # %bb.0:
    250 ; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
    251 ; SSE-NEXT:    pmulld %xmm0, %xmm1
    252 ; SSE-NEXT:    paddd %xmm1, %xmm0
    253 ; SSE-NEXT:    retq
    254 ;
    255 ; AVX-LABEL: combine_vec_mul_shl_multiuse1:
    256 ; AVX:       # %bb.0:
    257 ; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
    258 ; AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm1
    259 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
    260 ; AVX-NEXT:    retq
    261   %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
    262   %2 = mul <4 x i32> %y, %1
    263   %3 = add <4 x i32> %1, %2
    264   ret <4 x i32> %3
    265 }
    266 
    267 ; fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
    268 
    269 define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) {
        ; Adds the constants <1, 2, 8, 16> to %x and then multiplies by the constants
        ; <4, 6, 2, 0>. In the expected output the order is reversed: a (v)pmulld
        ; with a RIP-relative memory operand comes first, followed by a (v)paddd
        ; with a RIP-relative memory operand.
    270 ; SSE-LABEL: combine_vec_mul_add:
    271 ; SSE:       # %bb.0:
    272 ; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
    273 ; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
    274 ; SSE-NEXT:    retq
    275 ;
    276 ; AVX-LABEL: combine_vec_mul_add:
    277 ; AVX:       # %bb.0:
    278 ; AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
    279 ; AVX-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
    280 ; AVX-NEXT:    retq
    281   %1 = add <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
    282   %2 = mul <4 x i32> %1, <i32 4, i32 6, i32 2, i32 0>
    283   ret <4 x i32> %2
    284 }
    285 
    286 ; This would loop infinitely because DAGCombiner wants to turn this into a shift,
    287 ; but x86 lowering wants to avoid non-uniform vector shift amounts.
    288 
    289 define <16 x i8> @PR35579(<16 x i8> %x) {
        ; Byte-element multiply by a non-uniform constant vector whose elements are
        ; 0, 1, 2, 4 or 8. Neither check prefix expects a byte shift; the multiply is
        ; done at 16-bit width and narrowed back to bytes.
        ; SSE4.1 checks: the low and high 8 bytes of %x are widened separately
        ; (pmovzxbw / punpckhbw), each half is multiplied with pmullw, masked with
        ; 255 to keep only the low byte of every word, and repacked with packuswb.
        ; AVX2 checks: all 16 bytes are widened into ymm0 with vpmovsxbw, multiplied
        ; by one vpmullw, then the even-indexed bytes of each 128-bit half are
        ; gathered with vpshufb and joined with vpunpcklqdq. vzeroupper is expected
        ; before the return because a ymm register was used.
    290 ; SSE-LABEL: PR35579:
    291 ; SSE:       # %bb.0:
    292 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,1,4,1,2,1,8,1,2,1,4,1,2,1]
    293 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
    294 ; SSE-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    295 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
    296 ; SSE-NEXT:    pmullw %xmm2, %xmm0
    297 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
    298 ; SSE-NEXT:    pand %xmm2, %xmm0
    299 ; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
    300 ; SSE-NEXT:    pand %xmm2, %xmm1
    301 ; SSE-NEXT:    packuswb %xmm0, %xmm1
    302 ; SSE-NEXT:    movdqa %xmm1, %xmm0
    303 ; SSE-NEXT:    retq
    304 ;
    305 ; AVX-LABEL: PR35579:
    306 ; AVX:       # %bb.0:
    307 ; AVX-NEXT:    vpmovsxbw %xmm0, %ymm0
    308 ; AVX-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
    309 ; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
    310 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
    311 ; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    312 ; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    313 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    314 ; AVX-NEXT:    vzeroupper
    315 ; AVX-NEXT:    retq
    316   %r = mul <16 x i8> %x, <i8 0, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1, i8 8, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1>
    317   ret <16 x i8> %r
    318 }
    319 
    320