; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2

; Verify that the following shifts are lowered into a sequence of two shifts plus
; a blend. On pre-AVX2 targets, instead of scalarizing a logical or arithmetic
; packed shift right by a constant build_vector, the backend should always try to
; emit the simpler sequence of two uniform shifts plus a blend when possible.
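;
; For example, the constant build_vector in test1 below uses only the two shift
; amounts 3 and 2, so the whole lshr can be emitted as one uniform shift by 3,
; one uniform shift by 2, and a blend that takes the low lanes from the former:
; three cheap instructions instead of eight scalarized shifts.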

define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $3, %xmm1
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $3, %xmm1
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <4 x i32> @test3(<4 x i32> %a) {
; SSE-LABEL: test3:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $3, %xmm1
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE-LABEL: test4:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $3, %xmm1
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <8 x i16> @test5(<8 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    psraw $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test5:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test5:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
  %ashr = ashr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %ashr
}

define <8 x i16> @test6(<8 x i16> %a) {
; SSE-LABEL: test6:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    psraw $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test6:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test6:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
  %ashr = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %ashr
}

define <4 x i32> @test7(<4 x i32> %a) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test7:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %ashr = ashr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %ashr
}

define <4 x i32> @test8(<4 x i32> %a) {
; SSE-LABEL: test8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %ashr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %ashr
}

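; test9 - the shift amounts <1,3,1,1,1,3,3,3> do not select a contiguous low
; prefix of lanes, so the blend cannot be expressed with movss/movsd; without
; SSE4.1's pblendw the SSE lowering falls back to a pand/pandn/por sequence,
; while AVX can use a single vpblendw.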
define <8 x i16> @test9(<8 x i16> %a) {
; SSE-LABEL: test9:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,0,0]
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5,6,7]
; AVX-NEXT:    retq
  %ashr = ashr <8 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <8 x i16> %ashr
}

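; test10 - an ashr by a build_vector whose only defined element is lane 0; the
; backend may assume the undef lanes match it and splat the single constant
; amount across the whole vector.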
define <8 x i32> @test10(<8 x i32>* %a) {
; SSE-LABEL: test10:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 16(%rdi), %xmm1
; SSE-NEXT:    psrad $1, %xmm1
; SSE-NEXT:    psrad $1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test10:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test10:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ld = load <8 x i32>, <8 x i32>* %a, align 32
  %ashr = ashr <8 x i32> %ld, <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i32> %ashr
}

; test11 vs test12 - show the difference between a v16i16 shift-amount pattern
; that repeats at the v8i16 level and one that does not (only a repeated pattern
; lets a single PBLENDW mask serve both 128-bit halves).
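;
; VPBLENDW on a 256-bit vector applies its 8-bit immediate to each 128-bit lane
; independently, so the same word-blend mask must be valid for both halves. In
; test11 the two halves need different masks and AVX2 falls back to vpmullw; in
; test12 the pattern repeats and AVX2 can use two shifts plus one vpblendw.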

define <16 x i16> @test11(<16 x i16> %a) {
; SSE-LABEL: test11:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test11:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
; AVX1-NEXT:    vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 1, i16 1, i16 1, i16 3, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test12(<16 x i16> %a) {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,8,2,2,2,8,8,8]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test12:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $3, %ymm0, %ymm1
; AVX2-NEXT:    vpsllw $1, %ymm0, %ymm0
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
; AVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <16 x i16> %shl
}