; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2

;
; Variable Shifts
;

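; Without AVX2 there is no per-element variable i64 shift: psrlq shifts both
; lanes by the low 64 bits of its count operand, so the lowering shifts %a by
; each amount in turn and blends the two halves (movsd on SSE2, pblendw on
; SSE4.1/AVX1). AVX2/AVX512 use vpsrlvq directly; XOP negates the amounts and
; uses vpshlq, which shifts right for negative counts.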
define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_shift_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrlq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT:    psrlq %xmm1, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrlq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT:    psrlq %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v2i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v2i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shift_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    psrlq %xmm1, %xmm2
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT:    retl
  %shift = lshr <2 x i64> %a, %b
  ret <2 x i64> %shift
}

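; The same story for i32 without AVX2: each amount is moved into the low dword
; of a count register (the pshuflw patterns keep the upper count bits zero,
; since amounts of 32 or more would make the lshr undefined anyway), four
; scalar-count psrld results are computed, and shuffles/blends reassemble the
; vector. AVX2/AVX512 use vpsrlvd; XOP uses a negated vpshld.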
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrld %xmm2, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld %xmm4, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm1, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm1, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v4i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v4i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shift_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
; X32-SSE-NEXT:    psrld %xmm2, %xmm3
; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    psrld %xmm4, %xmm2
; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
; X32-SSE-NEXT:    psrld %xmm3, %xmm4
; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT:    psrld %xmm1, %xmm0
; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; X32-SSE-NEXT:    movaps %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <4 x i32> %a, %b
  ret <4 x i32> %shift
}

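; i16 has no variable-count shift before AVX512BW, so SSE2 goes bit-serial:
; psllw $12 moves bit k of each amount into the word sign bit, psraw $15 turns
; that into a lane mask, and the mask selects between x and x >> (8,4,2,1),
; with paddw doubling the amount vector between steps. SSE4.1/AVX1 do the same
; selection with pblendvb; AVX2/AVX512DQ zero-extend to i32 for vpsrlvd and
; truncate back; AVX512BW(VL) has a native vpsrlvw.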
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllw $12, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psllw $12, %xmm0
; SSE41-NEXT:    psllw $4, %xmm1
; SSE41-NEXT:    por %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    paddw %xmm1, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psrlw $8, %xmm4
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $4, %xmm1
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $2, %xmm1
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: var_shift_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: var_shift_v8i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: var_shift_v8i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: var_shift_v8i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    psllw $12, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psrlw $8, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psrlw $2, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    psraw $15, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pandn %xmm0, %xmm2
; X32-SSE-NEXT:    psrlw $1, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    por %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <8 x i16> %a, %b
  ret <8 x i16> %shift
}

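; x86 has no byte shifts at all, so the i8 case shifts whole words and masks
; away the bits that cross byte boundaries (the pand of a constant-pool mask
; after each psrlw). psllw $5 places amount bit k in each byte's sign bit;
; pcmpgtb against zero (or pblendvb, which also keys off the sign bit) then
; selects between x and x >> (4,2,1) per byte.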
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllw $5, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psllw $5, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrlw $4, %xmm3
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrlw $2, %xmm3
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE41-NEXT:    paddb %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrlw $1, %xmm3
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE41-NEXT:    paddb %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: var_shift_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: var_shift_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: var_shift_v16i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: var_shift_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    psllw $5, %xmm1
; X32-SSE-NEXT:    pxor %xmm2, %xmm2
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $2, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm1, %xmm1
; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
; X32-SSE-NEXT:    pandn %xmm0, %xmm1
; X32-SSE-NEXT:    psrlw $1, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <16 x i8> %a, %b
  ret <16 x i8> %shift
}

;
; Uniform Variable Shifts
;

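; A splatted shift amount can use the legacy count-in-xmm shift forms, which
; already apply the low 64 bits of the count register to every lane.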
define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_shift_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlq %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatvar_shift_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
  %shift = lshr <2 x i64> %a, %splat
  ret <2 x i64> %shift
}

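; For i32 only the low element of the splat is the count, so it is
; zero-extended to a clean 64-bit count first: movss from a zeroed register on
; SSE2, pmovzxdq on SSE4.1 and later.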
define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm2, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    psrld %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatvar_shift_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOP-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    xorps %xmm2, %xmm2
; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-SSE-NEXT:    psrld %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i32> %a, %splat
  ret <4 x i32> %shift
}

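; Same for i16: SSE2 round-trips the amount through a GPR (pextrw/movd) to
; zero-extend it, while SSE4.1 and later use pmovzxwq.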
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pextrw $0, %xmm1, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    psrlw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE41-NEXT:    psrlw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatvar_shift_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOP-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    pextrw $0, %xmm1, %eax
; X32-SSE-NEXT:    movd %eax, %xmm1
; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i16> %a, %splat
  ret <8 x i16> %shift
}

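; There is no uniform byte shift either, so the amount is re-broadcast
; (pshufb on SSE4.1/AVX1, vpbroadcastb on AVX2) and then fed through the same
; bit-serial blend sequence as the fully variable v16i8 case above.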
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; SSE2-NEXT:    psllw $5, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm1
; SSE41-NEXT:    psllw $5, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    paddb %xmm1, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm4
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $2, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: splatvar_shift_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: splatvar_shift_v16i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: splatvar_shift_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; X32-SSE-NEXT:    psllw $5, %xmm2
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $2, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pandn %xmm0, %xmm2
; X32-SSE-NEXT:    psrlw $1, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    por %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i8> %a, %splat
  ret <16 x i8> %shift
}

;
; Constant Shifts
;

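; With constant amounts the i64 case is just two immediate psrlq shifts plus a
; blend; AVX2 and later fold the amounts into a constant-pool operand for
; vpsrlvq.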
define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq $1, %xmm1
; SSE2-NEXT:    psrlq $7, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlq $7, %xmm1
; SSE41-NEXT:    psrlq $1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_shift_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v2i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v2i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: constant_shift_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psrlq $1, %xmm1
; X32-SSE-NEXT:    psrlq $7, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT:    retl
  %shift = lshr <2 x i64> %a, <i64 1, i64 7>
  ret <2 x i64> %shift
}

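; Constant i32 amounts become four immediate psrld shifts merged with
; shuffles/blends before AVX2, and a single vpsrlvd with a constant-pool
; operand afterwards.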
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $7, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $6, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $5, %xmm1
; SSE2-NEXT:    psrld $4, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $7, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $5, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $6, %xmm1
; SSE41-NEXT:    psrld $4, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_shift_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrld $6, %xmm0, %xmm2
; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v4i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v4i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: constant_shift_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v4i32:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psrld $7, %xmm1
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    psrld $6, %xmm2
; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psrld $5, %xmm1
; X32-SSE-NEXT:    psrld $4, %xmm0
; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; X32-SSE-NEXT:    retl
  %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %shift
}

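; A constant lshr of an i16 lane by k is a multiply-high by 2^(16-k), so
; SSE4.1 and later use one pmulhuw against the
; <u,32768,16384,8192,4096,2048,1024,512> constant (2^(16-k) for k = 1..7);
; k = 0 has no 16-bit multiplier, hence the undef lane and the pblendw that
; keeps element 0 of the input.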
define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    andps %xmm1, %xmm0
; SSE2-NEXT:    psrlw $1, %xmm2
; SSE2-NEXT:    andnps %xmm2, %xmm1
; SSE2-NEXT:    orps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,4096,2048,1024,512>
; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: constant_shift_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT:    retq
;
; XOP-LABEL: constant_shift_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: constant_shift_v8i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: constant_shift_v8i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: constant_shift_v8i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psrlw $4, %xmm1
; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT:    movapd %xmm1, %xmm2
; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
; X32-SSE-NEXT:    psrlw $2, %xmm1
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; X32-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X32-SSE-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
; X32-SSE-NEXT:    movaps %xmm2, %xmm0
; X32-SSE-NEXT:    andps %xmm1, %xmm0
; X32-SSE-NEXT:    psrlw $1, %xmm2
; X32-SSE-NEXT:    andnps %xmm2, %xmm1
; X32-SSE-NEXT:    orps %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
  ret <8 x i16> %shift
}

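; The constant v16i8 case reuses the variable blend ladder with the psllw $5
; step folded into the constant: [8192,24640,41088,57536,49376,32928,16480,32]
; is the byte amount vector <0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0> with each byte
; pre-shifted left by five.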
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32]
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrlw $4, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [8192,24640,41088,57536,49376,32928,16480,32]
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlw $2, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    paddb %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlw $1, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    paddb %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32]
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: constant_shift_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: constant_shift_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: constant_shift_v16i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQVL-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: constant_shift_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BWVL-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32]
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $2, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pandn %xmm0, %xmm2
; X32-SSE-NEXT:    psrlw $1, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    por %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <16 x i8> %shift
   1214 }
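
; Note on constant_shift_v16i8 above (a reading of the generated code, not a
; required lowering): x86 has no per-byte shift instruction, so the non-uniform
; byte shift is built in stages. On SSE2/SSE4.1/AVX the shift amounts are
; pre-scaled into the high bits of each byte (the [8192,24640,...] constant);
; each round shifts the whole vector as words by 4, then 2, then 1
; (psrlw plus pand to clear the bits pulled in across byte boundaries) and
; selects shifted vs. unshifted bytes, via pcmpgtb/pandn/por masks on SSE2 or
; pblendvb on SSE4.1/AVX, with paddb doubling the control vector so the next
; amount bit lands in each byte's sign bit. XOP has a true per-byte shift
; (vpshlb; negated amounts give a logical right shift), and AVX512 widens the
; bytes to words/dwords, uses vpsrlvw/vpsrlvd, and truncates back down.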

;
; Uniform Constant Shifts
;
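; A splat constant amount maps directly onto the shift-by-immediate forms
; (psrlq/psrld/psrlw), so each case below should be a single instruction;
; v16i8 is again the exception (see the note before that test).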

define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlq $7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrlq $7, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    psrlq $7, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}

define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $5, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $5, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrld $5, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $5, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $5, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    psrld $5, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}

define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrlw $3, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    psrlw $3, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}

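; With no byte-granularity psrl, a uniform byte shift is done as a word shift
; plus a mask: psrlw $3 pulls 3 bits into each low byte from its high
; neighbour, and pand with a splat of (0xff >> 3) = 0x1f clears them again.
; Worked example (arbitrary values): bytes 0xab,0xcd form the word 0xcdab;
; 0xcdab >> 3 = 0x19b5, and masking each byte with 0x1f gives 0x15,0x19,
; which are exactly 0xab >> 3 and 0xcd >> 3.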
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    psrlw $3, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}