; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST
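; The four RUN lines cover the 32-bit (X32) and 64-bit (X64) targets, each with
; and without the +fast-variable-shuffle tuning flag; in this file the SLOW/FAST
; prefixes only diverge for srl_trunc_and_v4i64, where variable-shuffle lowering
; differs.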

; AVX2 Logical Shift Left
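; Constant splat shifts of 256-bit vectors: per the CHECK lines below, a shift
; by zero folds away entirely, a left shift by one is turned into an add
; (vpaddw/vpaddd/vpaddq), and any other uniform amount selects the immediate
; forms vpsllw/vpslld/vpsllq.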

define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_sllw_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllw_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsllw $15, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllw_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsllw $15, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
; X32-LABEL: test_slld_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_slld_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
; X32-LABEL: test_slld_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_slld_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_vpslld_var(i32 %shift) {
; X32-LABEL: test_vpslld_var:
; X32:       # %bb.0:
; X32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X32-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpslld_var:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X64-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
  %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
  ret <8 x i32> %tmp
}
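; Note: a uniform (splatted) variable amount matches the legacy count-in-xmm
; form (vpslld %xmm0, %ymm1, %ymm0) rather than a per-element vpsllvd; on X32
; the scalar count is loaded with vmovd from the stack argument.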

define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
; X32-LABEL: test_slld_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpslld $31, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_slld_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpslld $31, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_sllq_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllq_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsllq $63, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllq_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsllq $63, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

; AVX2 Arithmetic Shift
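; Arithmetic right shifts by a constant splat: shift-by-zero folds away and the
; other amounts use the vpsraw/vpsrad immediate forms. There are no 64-bit cases
; here because AVX2 provides no vpsraq (64-bit arithmetic shifts need AVX-512 or
; an expansion).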

define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_sraw_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsraw $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sraw_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsraw $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsraw $15, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sraw_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsraw $15, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
; X32-LABEL: test_srad_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srad_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
; X32-LABEL: test_srad_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrad $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srad_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrad $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
; X32-LABEL: test_srad_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrad $31, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srad_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrad $31, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

; AVX2 Logical Shift Right
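; Logical right shifts by a constant splat mirror the left-shift cases: zero
; amounts fold away and nonzero amounts use the vpsrlw/vpsrld/vpsrlq immediate
; forms; unlike shl-by-one, lshr-by-one stays a real shift.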

define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srlw_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrlw $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlw_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrlw $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrlw $15, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlw_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrlw $15, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
; X32-LABEL: test_srld_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srld_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
; X32-LABEL: test_srld_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrld $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srld_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrld $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
; X32-LABEL: test_srld_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrld $31, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srld_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrld $31, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srlq_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrlq $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlq_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrlq $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrlq $63, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlq_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrlq $63, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X32-SLOW-LABEL: srl_trunc_and_v4i64:
; X32-SLOW:       # %bb.0:
; X32-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X32-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X32-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X32-SLOW-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X32-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X32-SLOW-NEXT:    vzeroupper
; X32-SLOW-NEXT:    retl
;
; X32-FAST-LABEL: srl_trunc_and_v4i64:
; X32-FAST:       # %bb.0:
; X32-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; X32-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; X32-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X32-FAST-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X32-FAST-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X32-FAST-NEXT:    vzeroupper
; X32-FAST-NEXT:    retl
;
; X64-SLOW-LABEL: srl_trunc_and_v4i64:
; X64-SLOW:       # %bb.0:
; X64-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X64-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X64-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-SLOW-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X64-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-SLOW-NEXT:    vzeroupper
; X64-SLOW-NEXT:    retq
;
; X64-FAST-LABEL: srl_trunc_and_v4i64:
; X64-FAST:       # %bb.0:
; X64-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; X64-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; X64-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X64-FAST-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-NEXT:    vzeroupper
; X64-FAST-NEXT:    retq
  %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
  %trunc = trunc <4 x i64> %and to <4 x i32>
  %sra = lshr <4 x i32> %x, %trunc
  ret <4 x i32> %sra
}
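; Note: the i64 mask-and-truncate of the shift amounts is narrowed to 32 bits,
; so the shift itself is a single xmm vpsrlvd. The SLOW variants build the
; truncating shuffle from vpshufd+vpermq, while +fast-variable-shuffle prefers a
; single vpermd with a constant index vector.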

;
; Vectorized variable (per-element) word and byte shifts
;
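; AVX2 only has per-element shifts for 32- and 64-bit lanes (vpsllvd/q,
; vpsrlvd/q, vpsravd), so the i16 cases below are widened: <8 x i16> extends
; both operands to <8 x i32>, shifts with vpsllvd/vpsravd/vpsrlvd, and truncates
; back, while <16 x i16> does the same on two unpacked halves and repacks them
; with vpackusdw/vpackssdw.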

define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: shl_8i16:
; X32:       # %bb.0:
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: shl_8i16:
; X64:       # %bb.0:
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %shl = shl <8 x i16> %r, %a
  ret <8 x i16> %shl
}

define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: shl_16i16:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: shl_16i16:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT:    retq
  %shl = shl <16 x i16> %r, %a
  ret <16 x i16> %shl
}

define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: shl_32i8:
; X32:       # %bb.0:
; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
; X32-NEXT:    vpsllw $4, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpsllw $2, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: shl_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsllw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsllw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %shl = shl <32 x i8> %r, %a
  ret <32 x i8> %shl
}
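; Note: there is no AVX2 byte shift at all, so the <32 x i8> amounts are scaled
; by vpsllw $5 to move each amount bit into its byte's MSB, and three
; vpblendvb-selected steps conditionally shift by 4, 2 and 1 (each left-shift
; result is vpand-masked to clear bits that crossed byte boundaries, and the
; final shift-by-1 uses vpaddb instead).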

define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: ashr_8i16:
; X32:       # %bb.0:
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT:    vpmovsxwd %xmm0, %ymm0
; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X32-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: ashr_8i16:
; X64:       # %bb.0:
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT:    vpmovsxwd %xmm0, %ymm0
; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %ashr = ashr <8 x i16> %r, %a
  ret <8 x i16> %ashr
}

define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: ashr_16i16:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: ashr_16i16:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT:    retq
  %ashr = ashr <16 x i16> %r, %a
  ret <16 x i16> %ashr
}

define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: ashr_32i8:
; X32:       # %bb.0:
; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT:    vpsraw $4, %ymm3, %ymm4
; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X32-NEXT:    vpsraw $2, %ymm3, %ymm4
; X32-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X32-NEXT:    vpsraw $1, %ymm3, %ymm4
; X32-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X32-NEXT:    vpsrlw $8, %ymm2, %ymm2
; X32-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT:    vpsraw $4, %ymm0, %ymm3
; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT:    vpsraw $2, %ymm0, %ymm3
; X32-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT:    vpsraw $1, %ymm0, %ymm3
; X32-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X32-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: ashr_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT:    vpsraw $4, %ymm3, %ymm4
; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT:    vpsraw $2, %ymm3, %ymm4
; X64-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT:    vpsraw $1, %ymm3, %ymm4
; X64-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X64-NEXT:    vpsrlw $8, %ymm2, %ymm2
; X64-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT:    vpsraw $4, %ymm0, %ymm3
; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT:    vpsraw $2, %ymm0, %ymm3
; X64-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT:    vpsraw $1, %ymm0, %ymm3
; X64-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X64-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %ashr = ashr <32 x i8> %r, %a
  ret <32 x i8> %ashr
}
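; Note: the arithmetic byte shift keeps each source byte in the high half of a
; word (vpunpckhbw/vpunpcklbw of the source with itself) so that vpsraw
; sign-extends it correctly, runs the same blend ladder with vpsraw steps of 4,
; 2 and 1 keyed off the vpsllw $5-scaled amounts, then shifts back down with
; vpsrlw $8 and repacks the two halves with vpackuswb.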

define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: lshr_8i16:
; X32:       # %bb.0:
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: lshr_8i16:
; X64:       # %bb.0:
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lshr = lshr <8 x i16> %r, %a
  ret <8 x i16> %lshr
}

define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: lshr_16i16:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: lshr_16i16:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT:    retq
  %lshr = lshr <16 x i16> %r, %a
  ret <16 x i16> %lshr
}

define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: lshr_32i8:
; X32:       # %bb.0:
; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
; X32-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: lshr_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %lshr = lshr <32 x i8> %r, %a
  ret <32 x i8> %lshr
}
    727