; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefix=AVX512

; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers when the shift amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.

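; A left shift by a constant amount k is the same as a multiply by 2^k, so the
; shift amounts <1,1,2,3,7,0,9,11> used in test1 correspond to the multiplier
; vector <2,2,4,8,128,1,512,2048> that pmullw loads from the constant pool (the
; same constant is spelled out in the SSE checks for test5 and test7 below).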
define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # BB#0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: test1:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test1:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}

define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # BB#0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: test2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test2:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}

; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.

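; The same shift-to-multiply trick applies to the 32-bit case: amounts
; <1,1,2,3> become the multipliers <2,2,4,8> visible in the SSE checks for
; test6. Note that the amounts -1 and -3 in test3 are out of range for an i32
; shift, so those lanes are undefined and the backend may presumably pick any
; multiplier for them. With AVX2/AVX512, a per-lane variable shift (vpsllvd)
; is used instead of a multiply.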
define <4 x i32> @test3(<4 x i32> %a) {
; SSE-LABEL: test3:
; SSE:       # BB#0:
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test3:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE-LABEL: test4:
; SSE:       # BB#0:
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: test4:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test4:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}

; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw.

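; Without AVX2 the <16 x i16> operand is legalized as two 128-bit halves (xmm0
; and xmm1), so the same multiplier constant [2,2,4,8,128,1,512,2048] is applied
; to each half with a separate pmullw.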
define <16 x i16> @test5(<16 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX2-LABEL: test5:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test5:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}

; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.

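; As in test5, the pre-AVX2 lowering works on two 128-bit halves, here with the
; multiplier [2,2,4,8]. AVX2 prefers vpsllvd because it provides variable
; per-lane shifts for 32-bit and 64-bit elements; a variable 16-bit shift
; (vpsllvw) only exists with AVX512BW, which is presumably why the i16 tests
; above keep using vpmullw even for the avx512f run.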
define <8 x i32> @test6(<8 x i32> %a) {
; SSE-LABEL: test6:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE-NEXT:    pmulld %xmm2, %xmm0
; SSE-NEXT:    pmulld %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX2-LABEL: test6:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test6:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}

; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. With SSE2 instead, the shift is split into four
; parts, and each part is then converted into a pmullw.

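; The <32 x i16> operand spans four xmm registers under SSE and two ymm
; registers under AVX2/AVX512F; a single 512-bit vpmullw would additionally
; require AVX512BW, which the -mattr=avx512f RUN line does not enable.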
define <32 x i16> @test7(<32 x i16> %a) {
; SSE-LABEL: test7:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm4, %xmm0
; SSE-NEXT:    pmullw %xmm4, %xmm1
; SSE-NEXT:    pmullw %xmm4, %xmm2
; SSE-NEXT:    pmullw %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test7:
; AVX512:       # BB#0:
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512-NEXT:    retq
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}

; Similar to test7; the difference is that with AVX512 support
; we produce only a single vpsllvd/vpsllvq instead of a pair of them.

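; Both vpsllvd and vpsllvq have 512-bit forms in AVX512F, so the zmm operands in
; test8 and test9 can be shifted with a single instruction each, while AVX2
; still has to process two ymm halves.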
define <16 x i32> @test8(<16 x i32> %a) {
; SSE-LABEL: test8:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE-NEXT:    pmulld %xmm4, %xmm0
; SSE-NEXT:    pmulld %xmm4, %xmm1
; SSE-NEXT:    pmulld %xmm4, %xmm2
; SSE-NEXT:    pmulld %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test8:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}

; If we don't have AVX2/AVX512F support, the shift in 'test9' is performed separately
; for each distinct shift amount and the results are then blended together.

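; In the SSE lowering below, the xmm halves whose lanes use two different shift
; amounts (<2,3>) are shifted twice with an immediate psllq and recombined with
; pblendw, while the halves shifted uniformly by 1 are lowered to paddq, since
; x + x == x << 1.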
define <8 x i64> @test9(<8 x i64> %a) {
; SSE-LABEL: test9:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    psllq $3, %xmm4
; SSE-NEXT:    psllq $2, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psllq $3, %xmm4
; SSE-NEXT:    psllq $2, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE-NEXT:    paddq %xmm0, %xmm0
; SSE-NEXT:    paddq %xmm2, %xmm2
; SSE-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test9:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}