; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512

; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers when the shift amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.
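; A shift left by a constant amount k is equivalent to a multiply by (1 << k),
; so the shift amounts <1,1,2,3,7,0,9,11> used below become the constant-pool
; multiplier vector <2,2,4,8,128,1,512,2048> read by pmullw (the same constant
; is visible in the test5/test7 check lines).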

define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}

define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}

; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) when the vector of shift
; counts is a constant build_vector.
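; Shift amounts of -1 and -3 wrap to values that are not smaller than the
; 32-bit element width, so those shl lanes are poison and the backend may pick
; any multiplier for them. pmulld requires SSE4.1; on plain SSE2 the 32-bit
; multiply is emulated with two pmuludq plus shuffles (see the SSE2 check lines
; below), while the AVX targets use the variable shift vpsllvd directly.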

define <4 x i32> @test3(<4 x i32> %a) {
; SSE2-LABEL: test3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-LABEL: test4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $1, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslld $1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}

; Without AVX2, verify that the following shift is split into two pmullw
; instructions. With AVX2, the test case below produces a single 256-bit
; vpmullw instead.
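; On the SSE-only configurations the <16 x i16> operand lives in two xmm
; registers; both halves use the same shift amounts, so a single multiplier
; constant (xmm2 below) is loaded once and fed to both pmullw instructions.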

define <16 x i16> @test5(<16 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}

; Without AVX2, verify that the following shift is split into two 128-bit
; multiplies (pmulld with SSE4.1). With AVX2, the test case below produces
; a single vpsllvd instead.
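; Plain SSE2 has no pmulld, so there each 128-bit half of the multiply is
; further expanded into a pair of pmuludq instructions plus shuffles, as the
; SSE2 check lines below show.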

define <8 x i32> @test6(<8 x i32> %a) {
; SSE2-LABEL: test6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test6:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}

; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. On SSE targets instead, the shift is split into
; four parts and each part is converted into a pmullw.
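; A single 512-bit vpmullw would need AVX512BW, which the avx512f RUN line
; does not enable, so the <32 x i16> shift is still lowered as two 256-bit
; vpmullw instructions and the AVX check prefix covers both AVX2 and AVX512.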

define <32 x i16> @test7(<32 x i16> %a) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm4, %xmm0
; SSE-NEXT:    pmullw %xmm4, %xmm1
; SSE-NEXT:    pmullw %xmm4, %xmm2
; SSE-NEXT:    pmullw %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX-LABEL: test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retq
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}

; Similar to test7; the difference is that with AVX512 support a single
; 512-bit vpsllvd (or vpsllvq in test9 below) is produced instead of a pair
; of vpsllvd/vpsllvq instructions.
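; Unlike the 16-bit case in test7, the 512-bit variable shifts vpsllvd and
; vpsllvq are already available with plain AVX512F, so no splitting into
; 256-bit halves is needed on the avx512f RUN line.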

define <16 x i32> @test8(<16 x i32> %a) {
; SSE2-LABEL: test8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT:    pmuludq %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT:    movdqa %xmm4, %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm4, %xmm0
; SSE41-NEXT:    pmulld %xmm4, %xmm1
; SSE41-NEXT:    pmulld %xmm4, %xmm2
; SSE41-NEXT:    pmulld %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}

; Without AVX2/AVX512F support, the elements of the shift in 'test9' are
; shifted separately and the results are then blended together.
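; Before AVX2 there is no variable-count psllq, so within each 128-bit part
; the two i64 elements are shifted by separate immediate-count psllq
; instructions and then blended (movsd on SSE2, pblendw on SSE4.1). The parts
; whose amounts are both 1 are lowered to paddq, since x + x == x << 1.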

define <8 x i64> @test9(<8 x i64> %a) {
; SSE2-LABEL: test9:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm3
; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT:    paddq %xmm0, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm2
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test9:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm3
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    paddq %xmm0, %xmm0
; SSE41-NEXT:    paddq %xmm2, %xmm2
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test9:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}