Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
      8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
      9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
     10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
     11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
     12 
     13 ; Just one 32-bit run to make sure we do reasonable things for i64 rotates.
     14 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2
     15 
     16 ;
     17 ; Variable Rotates
     18 ;
     19 
     20 define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
     21 ; SSE2-LABEL: var_rotate_v2i64:
     22 ; SSE2:       # %bb.0:
     23 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
     24 ; SSE2-NEXT:    psubq %xmm1, %xmm2
     25 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
     26 ; SSE2-NEXT:    psllq %xmm1, %xmm3
     27 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
     28 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
     29 ; SSE2-NEXT:    psllq %xmm1, %xmm4
     30 ; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
     31 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
     32 ; SSE2-NEXT:    psrlq %xmm2, %xmm1
     33 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
     34 ; SSE2-NEXT:    psrlq %xmm2, %xmm0
     35 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
     36 ; SSE2-NEXT:    orpd %xmm4, %xmm0
     37 ; SSE2-NEXT:    retq
     38 ;
     39 ; SSE41-LABEL: var_rotate_v2i64:
     40 ; SSE41:       # %bb.0:
     41 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
     42 ; SSE41-NEXT:    psubq %xmm1, %xmm2
     43 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
     44 ; SSE41-NEXT:    psllq %xmm1, %xmm3
     45 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
     46 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
     47 ; SSE41-NEXT:    psllq %xmm1, %xmm4
     48 ; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7]
     49 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
     50 ; SSE41-NEXT:    psrlq %xmm2, %xmm1
     51 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
     52 ; SSE41-NEXT:    psrlq %xmm2, %xmm0
     53 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
     54 ; SSE41-NEXT:    por %xmm4, %xmm0
     55 ; SSE41-NEXT:    retq
     56 ;
     57 ; AVX1-LABEL: var_rotate_v2i64:
     58 ; AVX1:       # %bb.0:
     59 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
     60 ; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
     61 ; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm3
     62 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
     63 ; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
     64 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
     65 ; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm3
     66 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
     67 ; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
     68 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
     69 ; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
     70 ; AVX1-NEXT:    retq
     71 ;
     72 ; AVX2-LABEL: var_rotate_v2i64:
     73 ; AVX2:       # %bb.0:
     74 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
     75 ; AVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
     76 ; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm1
     77 ; AVX2-NEXT:    vpsrlvq %xmm2, %xmm0, %xmm0
     78 ; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
     79 ; AVX2-NEXT:    retq
     80 ;
     81 ; AVX512F-LABEL: var_rotate_v2i64:
     82 ; AVX512F:       # %bb.0:
     83 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
     84 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
     85 ; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
     86 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
     87 ; AVX512F-NEXT:    vzeroupper
     88 ; AVX512F-NEXT:    retq
     89 ;
     90 ; AVX512VL-LABEL: var_rotate_v2i64:
     91 ; AVX512VL:       # %bb.0:
     92 ; AVX512VL-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
     93 ; AVX512VL-NEXT:    retq
     94 ;
     95 ; AVX512BW-LABEL: var_rotate_v2i64:
     96 ; AVX512BW:       # %bb.0:
     97 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
     98 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
     99 ; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
    100 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    101 ; AVX512BW-NEXT:    vzeroupper
    102 ; AVX512BW-NEXT:    retq
    103 ;
    104 ; AVX512VLBW-LABEL: var_rotate_v2i64:
    105 ; AVX512VLBW:       # %bb.0:
    106 ; AVX512VLBW-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
    107 ; AVX512VLBW-NEXT:    retq
    108 ;
    109 ; XOP-LABEL: var_rotate_v2i64:
    110 ; XOP:       # %bb.0:
    111 ; XOP-NEXT:    vprotq %xmm1, %xmm0, %xmm0
    112 ; XOP-NEXT:    retq
    113 ;
    114 ; X32-SSE-LABEL: var_rotate_v2i64:
    115 ; X32-SSE:       # %bb.0:
    116 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [64,0,64,0]
    117 ; X32-SSE-NEXT:    psubq %xmm1, %xmm2
    118 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
    119 ; X32-SSE-NEXT:    psllq %xmm1, %xmm3
    120 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
    121 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
    122 ; X32-SSE-NEXT:    psllq %xmm1, %xmm4
    123 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
    124 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
    125 ; X32-SSE-NEXT:    psrlq %xmm2, %xmm1
    126 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
    127 ; X32-SSE-NEXT:    psrlq %xmm2, %xmm0
    128 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
    129 ; X32-SSE-NEXT:    orpd %xmm4, %xmm0
    130 ; X32-SSE-NEXT:    retl
    131   %b64 = sub <2 x i64> <i64 64, i64 64>, %b
    132   %shl = shl <2 x i64> %a, %b
    133   %lshr = lshr <2 x i64> %a, %b64
    134   %or = or <2 x i64> %shl, %lshr
    135   ret <2 x i64> %or
    136 }
    137 
    138 define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
    139 ; SSE2-LABEL: var_rotate_v4i32:
    140 ; SSE2:       # %bb.0:
    141 ; SSE2-NEXT:    pslld $23, %xmm1
    142 ; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
    143 ; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
    144 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
    145 ; SSE2-NEXT:    pmuludq %xmm1, %xmm0
    146 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
    147 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
    148 ; SSE2-NEXT:    pmuludq %xmm2, %xmm1
    149 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
    150 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
    151 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    152 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    153 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    154 ; SSE2-NEXT:    por %xmm3, %xmm0
    155 ; SSE2-NEXT:    retq
    156 ;
    157 ; SSE41-LABEL: var_rotate_v4i32:
    158 ; SSE41:       # %bb.0:
    159 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
    160 ; SSE41-NEXT:    pslld $23, %xmm1
    161 ; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
    162 ; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
    163 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
    164 ; SSE41-NEXT:    pmuludq %xmm2, %xmm3
    165 ; SSE41-NEXT:    pmuludq %xmm1, %xmm0
    166 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
    167 ; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
    168 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
    169 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    170 ; SSE41-NEXT:    por %xmm1, %xmm0
    171 ; SSE41-NEXT:    retq
    172 ;
    173 ; AVX1-LABEL: var_rotate_v4i32:
    174 ; AVX1:       # %bb.0:
    175 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
    176 ; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
    177 ; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
    178 ; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
    179 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
    180 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
    181 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
    182 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
    183 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
    184 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
    185 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    186 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
    187 ; AVX1-NEXT:    retq
    188 ;
    189 ; AVX2-LABEL: var_rotate_v4i32:
    190 ; AVX2:       # %bb.0:
    191 ; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
    192 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
    193 ; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
    194 ; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
    195 ; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
    196 ; AVX2-NEXT:    retq
    197 ;
    198 ; AVX512F-LABEL: var_rotate_v4i32:
    199 ; AVX512F:       # %bb.0:
    200 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
    201 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
    202 ; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
    203 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    204 ; AVX512F-NEXT:    vzeroupper
    205 ; AVX512F-NEXT:    retq
    206 ;
    207 ; AVX512VL-LABEL: var_rotate_v4i32:
    208 ; AVX512VL:       # %bb.0:
    209 ; AVX512VL-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
    210 ; AVX512VL-NEXT:    retq
    211 ;
    212 ; AVX512BW-LABEL: var_rotate_v4i32:
    213 ; AVX512BW:       # %bb.0:
    214 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
    215 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
    216 ; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
    217 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    218 ; AVX512BW-NEXT:    vzeroupper
    219 ; AVX512BW-NEXT:    retq
    220 ;
    221 ; AVX512VLBW-LABEL: var_rotate_v4i32:
    222 ; AVX512VLBW:       # %bb.0:
    223 ; AVX512VLBW-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
    224 ; AVX512VLBW-NEXT:    retq
    225 ;
    226 ; XOP-LABEL: var_rotate_v4i32:
    227 ; XOP:       # %bb.0:
    228 ; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
    229 ; XOP-NEXT:    retq
    230 ;
    231 ; X32-SSE-LABEL: var_rotate_v4i32:
    232 ; X32-SSE:       # %bb.0:
    233 ; X32-SSE-NEXT:    pslld $23, %xmm1
    234 ; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm1
    235 ; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
    236 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
    237 ; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
    238 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
    239 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
    240 ; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
    241 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
    242 ; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
    243 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    244 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    245 ; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    246 ; X32-SSE-NEXT:    por %xmm3, %xmm0
    247 ; X32-SSE-NEXT:    retl
    248   %b32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %b
    249   %shl = shl <4 x i32> %a, %b
    250   %lshr = lshr <4 x i32> %a, %b32
    251   %or = or <4 x i32> %shl, %lshr
    252   ret <4 x i32> %or
    253 }
    254 
    255 define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
    256 ; SSE2-LABEL: var_rotate_v8i16:
    257 ; SSE2:       # %bb.0:
    258 ; SSE2-NEXT:    pxor %xmm2, %xmm2
    259 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
    260 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
    261 ; SSE2-NEXT:    pslld $23, %xmm3
    262 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
    263 ; SSE2-NEXT:    paddd %xmm4, %xmm3
    264 ; SSE2-NEXT:    cvttps2dq %xmm3, %xmm3
    265 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
    266 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
    267 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
    268 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
    269 ; SSE2-NEXT:    pslld $23, %xmm1
    270 ; SSE2-NEXT:    paddd %xmm4, %xmm1
    271 ; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
    272 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
    273 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
    274 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    275 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
    276 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
    277 ; SSE2-NEXT:    pmulhuw %xmm1, %xmm2
    278 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
    279 ; SSE2-NEXT:    por %xmm2, %xmm0
    280 ; SSE2-NEXT:    retq
    281 ;
    282 ; SSE41-LABEL: var_rotate_v8i16:
    283 ; SSE41:       # %bb.0:
    284 ; SSE41-NEXT:    pxor %xmm2, %xmm2
    285 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    286 ; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    287 ; SSE41-NEXT:    pslld $23, %xmm1
    288 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
    289 ; SSE41-NEXT:    paddd %xmm2, %xmm1
    290 ; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
    291 ; SSE41-NEXT:    pslld $23, %xmm3
    292 ; SSE41-NEXT:    paddd %xmm2, %xmm3
    293 ; SSE41-NEXT:    cvttps2dq %xmm3, %xmm2
    294 ; SSE41-NEXT:    packusdw %xmm1, %xmm2
    295 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
    296 ; SSE41-NEXT:    pmulhuw %xmm2, %xmm1
    297 ; SSE41-NEXT:    pmullw %xmm2, %xmm0
    298 ; SSE41-NEXT:    por %xmm1, %xmm0
    299 ; SSE41-NEXT:    retq
    300 ;
    301 ; AVX1-LABEL: var_rotate_v8i16:
    302 ; AVX1:       # %bb.0:
    303 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    304 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    305 ; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
    306 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
    307 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
    308 ; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
    309 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    310 ; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
    311 ; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
    312 ; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
    313 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
    314 ; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
    315 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
    316 ; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
    317 ; AVX1-NEXT:    retq
    318 ;
    319 ; AVX2-LABEL: var_rotate_v8i16:
    320 ; AVX2:       # %bb.0:
    321 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
    322 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    323 ; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm2
    324 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
    325 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
    326 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
    327 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
    328 ; AVX2-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
    329 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
    330 ; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
    331 ; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
    332 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    333 ; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
    334 ; AVX2-NEXT:    vzeroupper
    335 ; AVX2-NEXT:    retq
    336 ;
    337 ; AVX512F-LABEL: var_rotate_v8i16:
    338 ; AVX512F:       # %bb.0:
    339 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
    340 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    341 ; AVX512F-NEXT:    vpsllvd %ymm2, %ymm0, %ymm2
    342 ; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
    343 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
    344 ; AVX512F-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
    345 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
    346 ; AVX512F-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
    347 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
    348 ; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
    349 ; AVX512F-NEXT:    vzeroupper
    350 ; AVX512F-NEXT:    retq
    351 ;
    352 ; AVX512VL-LABEL: var_rotate_v8i16:
    353 ; AVX512VL:       # %bb.0:
    354 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
    355 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    356 ; AVX512VL-NEXT:    vpsllvd %ymm2, %ymm0, %ymm2
    357 ; AVX512VL-NEXT:    vpmovdw %ymm2, %xmm2
    358 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
    359 ; AVX512VL-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
    360 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
    361 ; AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
    362 ; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
    363 ; AVX512VL-NEXT:    vpor %xmm0, %xmm2, %xmm0
    364 ; AVX512VL-NEXT:    vzeroupper
    365 ; AVX512VL-NEXT:    retq
    366 ;
    367 ; AVX512BW-LABEL: var_rotate_v8i16:
    368 ; AVX512BW:       # %bb.0:
    369 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
    370 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
    371 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
    372 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
    373 ; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
    374 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
    375 ; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
    376 ; AVX512BW-NEXT:    vzeroupper
    377 ; AVX512BW-NEXT:    retq
    378 ;
    379 ; AVX512VLBW-LABEL: var_rotate_v8i16:
    380 ; AVX512VLBW:       # %bb.0:
    381 ; AVX512VLBW-NEXT:    vpsllvw %xmm1, %xmm0, %xmm2
    382 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
    383 ; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
    384 ; AVX512VLBW-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
    385 ; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
    386 ; AVX512VLBW-NEXT:    retq
    387 ;
    388 ; XOP-LABEL: var_rotate_v8i16:
    389 ; XOP:       # %bb.0:
    390 ; XOP-NEXT:    vprotw %xmm1, %xmm0, %xmm0
    391 ; XOP-NEXT:    retq
    392 ;
    393 ; X32-SSE-LABEL: var_rotate_v8i16:
    394 ; X32-SSE:       # %bb.0:
    395 ; X32-SSE-NEXT:    pxor %xmm2, %xmm2
    396 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
    397 ; X32-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
    398 ; X32-SSE-NEXT:    pslld $23, %xmm3
    399 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
    400 ; X32-SSE-NEXT:    paddd %xmm4, %xmm3
    401 ; X32-SSE-NEXT:    cvttps2dq %xmm3, %xmm3
    402 ; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
    403 ; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
    404 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
    405 ; X32-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
    406 ; X32-SSE-NEXT:    pslld $23, %xmm1
    407 ; X32-SSE-NEXT:    paddd %xmm4, %xmm1
    408 ; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
    409 ; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
    410 ; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
    411 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    412 ; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
    413 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
    414 ; X32-SSE-NEXT:    pmulhuw %xmm1, %xmm2
    415 ; X32-SSE-NEXT:    pmullw %xmm1, %xmm0
    416 ; X32-SSE-NEXT:    por %xmm2, %xmm0
    417 ; X32-SSE-NEXT:    retl
    418   %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
    419   %shl = shl <8 x i16> %a, %b
    420   %lshr = lshr <8 x i16> %a, %b16
    421   %or = or <8 x i16> %shl, %lshr
    422   ret <8 x i16> %or
    423 }
    424 
    425 define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
    426 ; SSE2-LABEL: var_rotate_v16i8:
    427 ; SSE2:       # %bb.0:
    428 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
    429 ; SSE2-NEXT:    psllw $5, %xmm1
    430 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    431 ; SSE2-NEXT:    pxor %xmm3, %xmm3
    432 ; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
    433 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
    434 ; SSE2-NEXT:    psrlw $4, %xmm4
    435 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
    436 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
    437 ; SSE2-NEXT:    psllw $4, %xmm5
    438 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm5
    439 ; SSE2-NEXT:    por %xmm4, %xmm5
    440 ; SSE2-NEXT:    pand %xmm3, %xmm5
    441 ; SSE2-NEXT:    pandn %xmm2, %xmm3
    442 ; SSE2-NEXT:    por %xmm5, %xmm3
    443 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
    444 ; SSE2-NEXT:    psrlw $6, %xmm2
    445 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
    446 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
    447 ; SSE2-NEXT:    psllw $2, %xmm4
    448 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
    449 ; SSE2-NEXT:    por %xmm2, %xmm4
    450 ; SSE2-NEXT:    paddb %xmm1, %xmm1
    451 ; SSE2-NEXT:    pxor %xmm2, %xmm2
    452 ; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
    453 ; SSE2-NEXT:    pand %xmm2, %xmm4
    454 ; SSE2-NEXT:    pandn %xmm3, %xmm2
    455 ; SSE2-NEXT:    por %xmm4, %xmm2
    456 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
    457 ; SSE2-NEXT:    paddb %xmm2, %xmm3
    458 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
    459 ; SSE2-NEXT:    psrlw $7, %xmm4
    460 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
    461 ; SSE2-NEXT:    por %xmm3, %xmm4
    462 ; SSE2-NEXT:    paddb %xmm1, %xmm1
    463 ; SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
    464 ; SSE2-NEXT:    pand %xmm0, %xmm4
    465 ; SSE2-NEXT:    pandn %xmm2, %xmm0
    466 ; SSE2-NEXT:    por %xmm4, %xmm0
    467 ; SSE2-NEXT:    retq
    468 ;
    469 ; SSE41-LABEL: var_rotate_v16i8:
    470 ; SSE41:       # %bb.0:
    471 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
    472 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
    473 ; SSE41-NEXT:    psrlw $4, %xmm0
    474 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
    475 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
    476 ; SSE41-NEXT:    psllw $4, %xmm3
    477 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
    478 ; SSE41-NEXT:    por %xmm0, %xmm3
    479 ; SSE41-NEXT:    psllw $5, %xmm2
    480 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
    481 ; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
    482 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
    483 ; SSE41-NEXT:    psrlw $6, %xmm0
    484 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
    485 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
    486 ; SSE41-NEXT:    psllw $2, %xmm3
    487 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
    488 ; SSE41-NEXT:    por %xmm0, %xmm3
    489 ; SSE41-NEXT:    paddb %xmm2, %xmm2
    490 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
    491 ; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
    492 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
    493 ; SSE41-NEXT:    paddb %xmm1, %xmm0
    494 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
    495 ; SSE41-NEXT:    psrlw $7, %xmm3
    496 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
    497 ; SSE41-NEXT:    por %xmm0, %xmm3
    498 ; SSE41-NEXT:    paddb %xmm2, %xmm2
    499 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
    500 ; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
    501 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
    502 ; SSE41-NEXT:    retq
    503 ;
    504 ; AVX-LABEL: var_rotate_v16i8:
    505 ; AVX:       # %bb.0:
    506 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm2
    507 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
    508 ; AVX-NEXT:    vpsllw $4, %xmm0, %xmm3
    509 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
    510 ; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
    511 ; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
    512 ; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
    513 ; AVX-NEXT:    vpsrlw $6, %xmm0, %xmm2
    514 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
    515 ; AVX-NEXT:    vpsllw $2, %xmm0, %xmm3
    516 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
    517 ; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
    518 ; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
    519 ; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
    520 ; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
    521 ; AVX-NEXT:    vpsrlw $7, %xmm0, %xmm3
    522 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
    523 ; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
    524 ; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
    525 ; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
    526 ; AVX-NEXT:    retq
    527 ;
    528 ; AVX512F-LABEL: var_rotate_v16i8:
    529 ; AVX512F:       # %bb.0:
    530 ; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm2
    531 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
    532 ; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm3
    533 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
    534 ; AVX512F-NEXT:    vpor %xmm2, %xmm3, %xmm2
    535 ; AVX512F-NEXT:    vpsllw $5, %xmm1, %xmm1
    536 ; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
    537 ; AVX512F-NEXT:    vpsrlw $6, %xmm0, %xmm2
    538 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
    539 ; AVX512F-NEXT:    vpsllw $2, %xmm0, %xmm3
    540 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
    541 ; AVX512F-NEXT:    vpor %xmm2, %xmm3, %xmm2
    542 ; AVX512F-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
    543 ; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
    544 ; AVX512F-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
    545 ; AVX512F-NEXT:    vpsrlw $7, %xmm0, %xmm3
    546 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
    547 ; AVX512F-NEXT:    vpor %xmm3, %xmm2, %xmm2
    548 ; AVX512F-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
    549 ; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
    550 ; AVX512F-NEXT:    retq
    551 ;
    552 ; AVX512VL-LABEL: var_rotate_v16i8:
    553 ; AVX512VL:       # %bb.0:
    554 ; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm2
    555 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
    556 ; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm3
    557 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
    558 ; AVX512VL-NEXT:    vpor %xmm2, %xmm3, %xmm2
    559 ; AVX512VL-NEXT:    vpsllw $5, %xmm1, %xmm1
    560 ; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
    561 ; AVX512VL-NEXT:    vpsrlw $6, %xmm0, %xmm2
    562 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
    563 ; AVX512VL-NEXT:    vpsllw $2, %xmm0, %xmm3
    564 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
    565 ; AVX512VL-NEXT:    vpor %xmm2, %xmm3, %xmm2
    566 ; AVX512VL-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
    567 ; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
    568 ; AVX512VL-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
    569 ; AVX512VL-NEXT:    vpsrlw $7, %xmm0, %xmm3
    570 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
    571 ; AVX512VL-NEXT:    vpor %xmm3, %xmm2, %xmm2
    572 ; AVX512VL-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
    573 ; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
    574 ; AVX512VL-NEXT:    retq
    575 ;
    576 ; AVX512BW-LABEL: var_rotate_v16i8:
    577 ; AVX512BW:       # %bb.0:
    578 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
    579 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
    580 ; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
    581 ; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
    582 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
    583 ; AVX512BW-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
    584 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
    585 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
    586 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
    587 ; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
    588 ; AVX512BW-NEXT:    vzeroupper
    589 ; AVX512BW-NEXT:    retq
    590 ;
    591 ; AVX512VLBW-LABEL: var_rotate_v16i8:
    592 ; AVX512VLBW:       # %bb.0:
    593 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
    594 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
    595 ; AVX512VLBW-NEXT:    vpsllvw %ymm2, %ymm0, %ymm2
    596 ; AVX512VLBW-NEXT:    vpmovwb %ymm2, %xmm2
    597 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
    598 ; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
    599 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
    600 ; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
    601 ; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
    602 ; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
    603 ; AVX512VLBW-NEXT:    vzeroupper
    604 ; AVX512VLBW-NEXT:    retq
    605 ;
    606 ; XOP-LABEL: var_rotate_v16i8:
    607 ; XOP:       # %bb.0:
    608 ; XOP-NEXT:    vprotb %xmm1, %xmm0, %xmm0
    609 ; XOP-NEXT:    retq
    610 ;
    611 ; X32-SSE-LABEL: var_rotate_v16i8:
    612 ; X32-SSE:       # %bb.0:
    613 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
    614 ; X32-SSE-NEXT:    psllw $5, %xmm1
    615 ; X32-SSE-NEXT:    pxor %xmm0, %xmm0
    616 ; X32-SSE-NEXT:    pxor %xmm3, %xmm3
    617 ; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
    618 ; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
    619 ; X32-SSE-NEXT:    psrlw $4, %xmm4
    620 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
    621 ; X32-SSE-NEXT:    movdqa %xmm2, %xmm5
    622 ; X32-SSE-NEXT:    psllw $4, %xmm5
    623 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm5
    624 ; X32-SSE-NEXT:    por %xmm4, %xmm5
    625 ; X32-SSE-NEXT:    pand %xmm3, %xmm5
    626 ; X32-SSE-NEXT:    pandn %xmm2, %xmm3
    627 ; X32-SSE-NEXT:    por %xmm5, %xmm3
    628 ; X32-SSE-NEXT:    movdqa %xmm3, %xmm2
    629 ; X32-SSE-NEXT:    psrlw $6, %xmm2
    630 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
    631 ; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
    632 ; X32-SSE-NEXT:    psllw $2, %xmm4
    633 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
    634 ; X32-SSE-NEXT:    por %xmm2, %xmm4
    635 ; X32-SSE-NEXT:    paddb %xmm1, %xmm1
    636 ; X32-SSE-NEXT:    pxor %xmm2, %xmm2
    637 ; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
    638 ; X32-SSE-NEXT:    pand %xmm2, %xmm4
    639 ; X32-SSE-NEXT:    pandn %xmm3, %xmm2
    640 ; X32-SSE-NEXT:    por %xmm4, %xmm2
    641 ; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
    642 ; X32-SSE-NEXT:    paddb %xmm2, %xmm3
    643 ; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
    644 ; X32-SSE-NEXT:    psrlw $7, %xmm4
    645 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
    646 ; X32-SSE-NEXT:    por %xmm3, %xmm4
    647 ; X32-SSE-NEXT:    paddb %xmm1, %xmm1
    648 ; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm0
    649 ; X32-SSE-NEXT:    pand %xmm0, %xmm4
    650 ; X32-SSE-NEXT:    pandn %xmm2, %xmm0
    651 ; X32-SSE-NEXT:    por %xmm4, %xmm0
    652 ; X32-SSE-NEXT:    retl
    653   %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
    654   %shl = shl <16 x i8> %a, %b
    655   %lshr = lshr <16 x i8> %a, %b8
    656   %or = or <16 x i8> %shl, %lshr
    657   ret <16 x i8> %or
    658 }
    659 
    660 ;
    661 ; Uniform Variable Rotates
    662 ;
    663 
    664 define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
    665 ; SSE-LABEL: splatvar_rotate_v2i64:
    666 ; SSE:       # %bb.0:
    667 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
    668 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,64]
    669 ; SSE-NEXT:    psubq %xmm2, %xmm3
    670 ; SSE-NEXT:    movdqa %xmm0, %xmm2
    671 ; SSE-NEXT:    psllq %xmm1, %xmm2
    672 ; SSE-NEXT:    psrlq %xmm3, %xmm0
    673 ; SSE-NEXT:    por %xmm2, %xmm0
    674 ; SSE-NEXT:    retq
    675 ;
    676 ; AVX1-LABEL: splatvar_rotate_v2i64:
    677 ; AVX1:       # %bb.0:
    678 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
    679 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [64,64]
    680 ; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
    681 ; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
    682 ; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
    683 ; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
    684 ; AVX1-NEXT:    retq
    685 ;
    686 ; AVX2-LABEL: splatvar_rotate_v2i64:
    687 ; AVX2:       # %bb.0:
    688 ; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm2
    689 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [64,64]
    690 ; AVX2-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
    691 ; AVX2-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
    692 ; AVX2-NEXT:    vpsrlvq %xmm2, %xmm0, %xmm0
    693 ; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
    694 ; AVX2-NEXT:    retq
    695 ;
    696 ; AVX512F-LABEL: splatvar_rotate_v2i64:
    697 ; AVX512F:       # %bb.0:
    698 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
    699 ; AVX512F-NEXT:    vpbroadcastq %xmm1, %xmm1
    700 ; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
    701 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    702 ; AVX512F-NEXT:    vzeroupper
    703 ; AVX512F-NEXT:    retq
    704 ;
    705 ; AVX512VL-LABEL: splatvar_rotate_v2i64:
    706 ; AVX512VL:       # %bb.0:
    707 ; AVX512VL-NEXT:    vpbroadcastq %xmm1, %xmm1
    708 ; AVX512VL-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
    709 ; AVX512VL-NEXT:    retq
    710 ;
    711 ; AVX512BW-LABEL: splatvar_rotate_v2i64:
    712 ; AVX512BW:       # %bb.0:
    713 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
    714 ; AVX512BW-NEXT:    vpbroadcastq %xmm1, %xmm1
    715 ; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
    716 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    717 ; AVX512BW-NEXT:    vzeroupper
    718 ; AVX512BW-NEXT:    retq
    719 ;
    720 ; AVX512VLBW-LABEL: splatvar_rotate_v2i64:
    721 ; AVX512VLBW:       # %bb.0:
    722 ; AVX512VLBW-NEXT:    vpbroadcastq %xmm1, %xmm1
    723 ; AVX512VLBW-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
    724 ; AVX512VLBW-NEXT:    retq
    725 ;
    726 ; XOPAVX1-LABEL: splatvar_rotate_v2i64:
    727 ; XOPAVX1:       # %bb.0:
    728 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
    729 ; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
    730 ; XOPAVX1-NEXT:    retq
    731 ;
    732 ; XOPAVX2-LABEL: splatvar_rotate_v2i64:
    733 ; XOPAVX2:       # %bb.0:
    734 ; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
    735 ; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
    736 ; XOPAVX2-NEXT:    retq
    737 ;
    738 ; X32-SSE-LABEL: splatvar_rotate_v2i64:
    739 ; X32-SSE:       # %bb.0:
    740 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
    741 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,0,64,0]
    742 ; X32-SSE-NEXT:    psubq %xmm2, %xmm3
    743 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
    744 ; X32-SSE-NEXT:    psllq %xmm1, %xmm2
    745 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
    746 ; X32-SSE-NEXT:    psrlq %xmm3, %xmm1
    747 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
    748 ; X32-SSE-NEXT:    psrlq %xmm3, %xmm0
    749 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
    750 ; X32-SSE-NEXT:    orpd %xmm2, %xmm0
    751 ; X32-SSE-NEXT:    retl
    752   %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
    753   %splat64 = sub <2 x i64> <i64 64, i64 64>, %splat
    754   %shl = shl <2 x i64> %a, %splat
    755   %lshr = lshr <2 x i64> %a, %splat64
    756   %or = or <2 x i64> %shl, %lshr
    757   ret <2 x i64> %or
    758 }
    759 
    760 define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
    761 ; SSE2-LABEL: splatvar_rotate_v4i32:
    762 ; SSE2:       # %bb.0:
    763 ; SSE2-NEXT:    xorps %xmm2, %xmm2
    764 ; SSE2-NEXT:    xorps %xmm3, %xmm3
    765 ; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
    766 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
    767 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
    768 ; SSE2-NEXT:    pslld %xmm3, %xmm4
    769 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32]
    770 ; SSE2-NEXT:    psubd %xmm1, %xmm3
    771 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
    772 ; SSE2-NEXT:    psrld %xmm2, %xmm0
    773 ; SSE2-NEXT:    por %xmm4, %xmm0
    774 ; SSE2-NEXT:    retq
    775 ;
    776 ; SSE41-LABEL: splatvar_rotate_v4i32:
    777 ; SSE41:       # %bb.0:
    778 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
    779 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
    780 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
    781 ; SSE41-NEXT:    pslld %xmm2, %xmm3
    782 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
    783 ; SSE41-NEXT:    psubd %xmm1, %xmm2
    784 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero
    785 ; SSE41-NEXT:    psrld %xmm1, %xmm0
    786 ; SSE41-NEXT:    por %xmm3, %xmm0
    787 ; SSE41-NEXT:    retq
    788 ;
    789 ; AVX1-LABEL: splatvar_rotate_v4i32:
    790 ; AVX1:       # %bb.0:
    791 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
    792 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
    793 ; AVX1-NEXT:    vpslld %xmm2, %xmm0, %xmm2
    794 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32]
    795 ; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
    796 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
    797 ; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
    798 ; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
    799 ; AVX1-NEXT:    retq
    800 ;
    801 ; AVX2-LABEL: splatvar_rotate_v4i32:
    802 ; AVX2:       # %bb.0:
    803 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
    804 ; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
    805 ; AVX2-NEXT:    vpslld %xmm2, %xmm0, %xmm2
    806 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
    807 ; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
    808 ; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
    809 ; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
    810 ; AVX2-NEXT:    retq
    811 ;
    812 ; AVX512F-LABEL: splatvar_rotate_v4i32:
    813 ; AVX512F:       # %bb.0:
    814 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
    815 ; AVX512F-NEXT:    vpbroadcastd %xmm1, %xmm1
    816 ; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
    817 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    818 ; AVX512F-NEXT:    vzeroupper
    819 ; AVX512F-NEXT:    retq
    820 ;
    821 ; AVX512VL-LABEL: splatvar_rotate_v4i32:
    822 ; AVX512VL:       # %bb.0:
    823 ; AVX512VL-NEXT:    vpbroadcastd %xmm1, %xmm1
    824 ; AVX512VL-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
    825 ; AVX512VL-NEXT:    retq
    826 ;
    827 ; AVX512BW-LABEL: splatvar_rotate_v4i32:
    828 ; AVX512BW:       # %bb.0:
    829 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
    830 ; AVX512BW-NEXT:    vpbroadcastd %xmm1, %xmm1
    831 ; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
    832 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    833 ; AVX512BW-NEXT:    vzeroupper
    834 ; AVX512BW-NEXT:    retq
    835 ;
    836 ; AVX512VLBW-LABEL: splatvar_rotate_v4i32:
    837 ; AVX512VLBW:       # %bb.0:
    838 ; AVX512VLBW-NEXT:    vpbroadcastd %xmm1, %xmm1
    839 ; AVX512VLBW-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
    840 ; AVX512VLBW-NEXT:    retq
    841 ;
    842 ; XOPAVX1-LABEL: splatvar_rotate_v4i32:
    843 ; XOPAVX1:       # %bb.0:
    844 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
    845 ; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
    846 ; XOPAVX1-NEXT:    retq
    847 ;
    848 ; XOPAVX2-LABEL: splatvar_rotate_v4i32:
    849 ; XOPAVX2:       # %bb.0:
    850 ; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
    851 ; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
    852 ; XOPAVX2-NEXT:    retq
    853 ;
    854 ; X32-SSE-LABEL: splatvar_rotate_v4i32:
    855 ; X32-SSE:       # %bb.0:
    856 ; X32-SSE-NEXT:    xorps %xmm2, %xmm2
    857 ; X32-SSE-NEXT:    xorps %xmm3, %xmm3
    858 ; X32-SSE-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
    859 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
    860 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
    861 ; X32-SSE-NEXT:    pslld %xmm3, %xmm4
    862 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32]
    863 ; X32-SSE-NEXT:    psubd %xmm1, %xmm3
    864 ; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
    865 ; X32-SSE-NEXT:    psrld %xmm2, %xmm0
    866 ; X32-SSE-NEXT:    por %xmm4, %xmm0
    867 ; X32-SSE-NEXT:    retl
    868   %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
    869   %splat32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %splat
    870   %shl = shl <4 x i32> %a, %splat
    871   %lshr = lshr <4 x i32> %a, %splat32
    872   %or = or <4 x i32> %shl, %lshr
    873   ret <4 x i32> %or
    874 }
    875 
    876 define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
    877 ; SSE2-LABEL: splatvar_rotate_v8i16:
    878 ; SSE2:       # %bb.0:
    879 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
    880 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
    881 ; SSE2-NEXT:    pextrw $0, %xmm1, %eax
    882 ; SSE2-NEXT:    movd %eax, %xmm1
    883 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
    884 ; SSE2-NEXT:    psllw %xmm1, %xmm3
    885 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
    886 ; SSE2-NEXT:    psubw %xmm2, %xmm1
    887 ; SSE2-NEXT:    pextrw $0, %xmm1, %eax
    888 ; SSE2-NEXT:    movd %eax, %xmm1
    889 ; SSE2-NEXT:    psrlw %xmm1, %xmm0
    890 ; SSE2-NEXT:    por %xmm3, %xmm0
    891 ; SSE2-NEXT:    retq
    892 ;
    893 ; SSE41-LABEL: splatvar_rotate_v8i16:
    894 ; SSE41:       # %bb.0:
    895 ; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
    896 ; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
    897 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
    898 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
    899 ; SSE41-NEXT:    psllw %xmm2, %xmm3
    900 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
    901 ; SSE41-NEXT:    psubw %xmm1, %xmm2
    902 ; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
    903 ; SSE41-NEXT:    psrlw %xmm1, %xmm0
    904 ; SSE41-NEXT:    por %xmm3, %xmm0
    905 ; SSE41-NEXT:    retq
    906 ;
    907 ; AVX1-LABEL: splatvar_rotate_v8i16:
    908 ; AVX1:       # %bb.0:
    909 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
    910 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
    911 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
    912 ; AVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
    913 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
    914 ; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
    915 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
    916 ; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
    917 ; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
    918 ; AVX1-NEXT:    retq
    919 ;
    920 ; AVX2-LABEL: splatvar_rotate_v8i16:
    921 ; AVX2:       # %bb.0:
    922 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
    923 ; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
    924 ; AVX2-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
    925 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
    926 ; AVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
    927 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
    928 ; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
    929 ; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
    930 ; AVX2-NEXT:    retq
    931 ;
    932 ; AVX512F-LABEL: splatvar_rotate_v8i16:
    933 ; AVX512F:       # %bb.0:
    934 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
    935 ; AVX512F-NEXT:    vpbroadcastw %xmm1, %xmm1
    936 ; AVX512F-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
    937 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
    938 ; AVX512F-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
    939 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
    940 ; AVX512F-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
    941 ; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
    942 ; AVX512F-NEXT:    retq
    943 ;
    944 ; AVX512VL-LABEL: splatvar_rotate_v8i16:
    945 ; AVX512VL:       # %bb.0:
    946 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
    947 ; AVX512VL-NEXT:    vpbroadcastw %xmm1, %xmm1
    948 ; AVX512VL-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
    949 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
    950 ; AVX512VL-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
    951 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
    952 ; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
    953 ; AVX512VL-NEXT:    vpor %xmm0, %xmm2, %xmm0
    954 ; AVX512VL-NEXT:    retq
    955 ;
    956 ; AVX512BW-LABEL: splatvar_rotate_v8i16:
    957 ; AVX512BW:       # %bb.0:
    958 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
    959 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
    960 ; AVX512BW-NEXT:    vpbroadcastw %xmm1, %xmm1
    961 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
    962 ; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
    963 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
    964 ; AVX512BW-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
    965 ; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
    966 ; AVX512BW-NEXT:    vzeroupper
    967 ; AVX512BW-NEXT:    retq
    968 ;
    969 ; AVX512VLBW-LABEL: splatvar_rotate_v8i16:
    970 ; AVX512VLBW:       # %bb.0:
    971 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
    972 ; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %xmm1
    973 ; AVX512VLBW-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
    974 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
    975 ; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
    976 ; AVX512VLBW-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
    977 ; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
    978 ; AVX512VLBW-NEXT:    retq
    979 ;
    980 ; XOPAVX1-LABEL: splatvar_rotate_v8i16:
    981 ; XOPAVX1:       # %bb.0:
    982 ; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
    983 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
    984 ; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
    985 ; XOPAVX1-NEXT:    retq
    986 ;
    987 ; XOPAVX2-LABEL: splatvar_rotate_v8i16:
    988 ; XOPAVX2:       # %bb.0:
    989 ; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
    990 ; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
    991 ; XOPAVX2-NEXT:    retq
    992 ;
    993 ; X32-SSE-LABEL: splatvar_rotate_v8i16:
    994 ; X32-SSE:       # %bb.0:
    995 ; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
    996 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
    997 ; X32-SSE-NEXT:    pextrw $0, %xmm1, %eax
    998 ; X32-SSE-NEXT:    movd %eax, %xmm1
    999 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
   1000 ; X32-SSE-NEXT:    psllw %xmm1, %xmm3
   1001 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
   1002 ; X32-SSE-NEXT:    psubw %xmm2, %xmm1
   1003 ; X32-SSE-NEXT:    pextrw $0, %xmm1, %eax
   1004 ; X32-SSE-NEXT:    movd %eax, %xmm1
   1005 ; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
   1006 ; X32-SSE-NEXT:    por %xmm3, %xmm0
   1007 ; X32-SSE-NEXT:    retl
   1008   %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
   1009   %splat16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
   1010   %shl = shl <8 x i16> %a, %splat
   1011   %lshr = lshr <8 x i16> %a, %splat16
   1012   %or = or <8 x i16> %shl, %lshr
   1013   ret <8 x i16> %or
   1014 }
   1015 
   1016 define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
   1017 ; SSE2-LABEL: splatvar_rotate_v16i8:
   1018 ; SSE2:       # %bb.0:
   1019 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1020 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1021 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
   1022 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
   1023 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
   1024 ; SSE2-NEXT:    psrlw $4, %xmm0
   1025 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1026 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
   1027 ; SSE2-NEXT:    psllw $4, %xmm3
   1028 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1029 ; SSE2-NEXT:    por %xmm0, %xmm3
   1030 ; SSE2-NEXT:    psllw $5, %xmm1
   1031 ; SSE2-NEXT:    pxor %xmm0, %xmm0
   1032 ; SSE2-NEXT:    pxor %xmm4, %xmm4
   1033 ; SSE2-NEXT:    pcmpgtb %xmm1, %xmm4
   1034 ; SSE2-NEXT:    pand %xmm4, %xmm3
   1035 ; SSE2-NEXT:    pandn %xmm2, %xmm4
   1036 ; SSE2-NEXT:    por %xmm3, %xmm4
   1037 ; SSE2-NEXT:    movdqa %xmm4, %xmm2
   1038 ; SSE2-NEXT:    psrlw $6, %xmm2
   1039 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
   1040 ; SSE2-NEXT:    movdqa %xmm4, %xmm3
   1041 ; SSE2-NEXT:    psllw $2, %xmm3
   1042 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1043 ; SSE2-NEXT:    por %xmm2, %xmm3
   1044 ; SSE2-NEXT:    paddb %xmm1, %xmm1
   1045 ; SSE2-NEXT:    pxor %xmm2, %xmm2
   1046 ; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
   1047 ; SSE2-NEXT:    pand %xmm2, %xmm3
   1048 ; SSE2-NEXT:    pandn %xmm4, %xmm2
   1049 ; SSE2-NEXT:    por %xmm3, %xmm2
   1050 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
   1051 ; SSE2-NEXT:    paddb %xmm2, %xmm3
   1052 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   1053 ; SSE2-NEXT:    psrlw $7, %xmm4
   1054 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
   1055 ; SSE2-NEXT:    por %xmm3, %xmm4
   1056 ; SSE2-NEXT:    paddb %xmm1, %xmm1
   1057 ; SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
   1058 ; SSE2-NEXT:    pand %xmm0, %xmm4
   1059 ; SSE2-NEXT:    pandn %xmm2, %xmm0
   1060 ; SSE2-NEXT:    por %xmm4, %xmm0
   1061 ; SSE2-NEXT:    retq
   1062 ;
   1063 ; SSE41-LABEL: splatvar_rotate_v16i8:
   1064 ; SSE41:       # %bb.0:
   1065 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
   1066 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
   1067 ; SSE41-NEXT:    pxor %xmm0, %xmm0
   1068 ; SSE41-NEXT:    pshufb %xmm0, %xmm2
   1069 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
   1070 ; SSE41-NEXT:    psrlw $4, %xmm0
   1071 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
   1072 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
   1073 ; SSE41-NEXT:    psllw $4, %xmm3
   1074 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
   1075 ; SSE41-NEXT:    por %xmm0, %xmm3
   1076 ; SSE41-NEXT:    psllw $5, %xmm2
   1077 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
   1078 ; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
   1079 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
   1080 ; SSE41-NEXT:    psrlw $6, %xmm0
   1081 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
   1082 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
   1083 ; SSE41-NEXT:    psllw $2, %xmm3
   1084 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
   1085 ; SSE41-NEXT:    por %xmm0, %xmm3
   1086 ; SSE41-NEXT:    paddb %xmm2, %xmm2
   1087 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
   1088 ; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
   1089 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
   1090 ; SSE41-NEXT:    paddb %xmm1, %xmm0
   1091 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
   1092 ; SSE41-NEXT:    psrlw $7, %xmm3
   1093 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
   1094 ; SSE41-NEXT:    por %xmm0, %xmm3
   1095 ; SSE41-NEXT:    paddb %xmm2, %xmm2
   1096 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
   1097 ; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
   1098 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
   1099 ; SSE41-NEXT:    retq
   1100 ;
   1101 ; AVX1-LABEL: splatvar_rotate_v16i8:
   1102 ; AVX1:       # %bb.0:
   1103 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
   1104 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1105 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
   1106 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
   1107 ; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm3
   1108 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1109 ; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
   1110 ; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
   1111 ; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
   1112 ; AVX1-NEXT:    vpsrlw $6, %xmm0, %xmm2
   1113 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
   1114 ; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm3
   1115 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1116 ; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
   1117 ; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
   1118 ; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
   1119 ; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
   1120 ; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm3
   1121 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1122 ; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
   1123 ; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
   1124 ; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
   1125 ; AVX1-NEXT:    retq
   1126 ;
   1127 ; AVX2-LABEL: splatvar_rotate_v16i8:
   1128 ; AVX2:       # %bb.0:
   1129 ; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
   1130 ; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm2
   1131 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
   1132 ; AVX2-NEXT:    vpsllw $4, %xmm0, %xmm3
   1133 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1134 ; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
   1135 ; AVX2-NEXT:    vpsllw $5, %xmm1, %xmm1
   1136 ; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
   1137 ; AVX2-NEXT:    vpsrlw $6, %xmm0, %xmm2
   1138 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
   1139 ; AVX2-NEXT:    vpsllw $2, %xmm0, %xmm3
   1140 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1141 ; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
   1142 ; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
   1143 ; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
   1144 ; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
   1145 ; AVX2-NEXT:    vpsrlw $7, %xmm0, %xmm3
   1146 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1147 ; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
   1148 ; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
   1149 ; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
   1150 ; AVX2-NEXT:    retq
   1151 ;
   1152 ; AVX512F-LABEL: splatvar_rotate_v16i8:
   1153 ; AVX512F:       # %bb.0:
   1154 ; AVX512F-NEXT:    vpbroadcastb %xmm1, %xmm1
   1155 ; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm2
   1156 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
   1157 ; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm3
   1158 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1159 ; AVX512F-NEXT:    vpor %xmm2, %xmm3, %xmm2
   1160 ; AVX512F-NEXT:    vpsllw $5, %xmm1, %xmm1
   1161 ; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
   1162 ; AVX512F-NEXT:    vpsrlw $6, %xmm0, %xmm2
   1163 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
   1164 ; AVX512F-NEXT:    vpsllw $2, %xmm0, %xmm3
   1165 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1166 ; AVX512F-NEXT:    vpor %xmm2, %xmm3, %xmm2
   1167 ; AVX512F-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
   1168 ; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
   1169 ; AVX512F-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
   1170 ; AVX512F-NEXT:    vpsrlw $7, %xmm0, %xmm3
   1171 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1172 ; AVX512F-NEXT:    vpor %xmm3, %xmm2, %xmm2
   1173 ; AVX512F-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
   1174 ; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
   1175 ; AVX512F-NEXT:    retq
   1176 ;
   1177 ; AVX512VL-LABEL: splatvar_rotate_v16i8:
   1178 ; AVX512VL:       # %bb.0:
   1179 ; AVX512VL-NEXT:    vpbroadcastb %xmm1, %xmm1
   1180 ; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm2
   1181 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
   1182 ; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm3
   1183 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1184 ; AVX512VL-NEXT:    vpor %xmm2, %xmm3, %xmm2
   1185 ; AVX512VL-NEXT:    vpsllw $5, %xmm1, %xmm1
   1186 ; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
   1187 ; AVX512VL-NEXT:    vpsrlw $6, %xmm0, %xmm2
   1188 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
   1189 ; AVX512VL-NEXT:    vpsllw $2, %xmm0, %xmm3
   1190 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1191 ; AVX512VL-NEXT:    vpor %xmm2, %xmm3, %xmm2
   1192 ; AVX512VL-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
   1193 ; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
   1194 ; AVX512VL-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
   1195 ; AVX512VL-NEXT:    vpsrlw $7, %xmm0, %xmm3
   1196 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1197 ; AVX512VL-NEXT:    vpor %xmm3, %xmm2, %xmm2
   1198 ; AVX512VL-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
   1199 ; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
   1200 ; AVX512VL-NEXT:    retq
   1201 ;
   1202 ; AVX512BW-LABEL: splatvar_rotate_v16i8:
   1203 ; AVX512BW:       # %bb.0:
   1204 ; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
   1205 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
   1206 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
   1207 ; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
   1208 ; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
   1209 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
   1210 ; AVX512BW-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
   1211 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
   1212 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
   1213 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   1214 ; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
   1215 ; AVX512BW-NEXT:    vzeroupper
   1216 ; AVX512BW-NEXT:    retq
   1217 ;
   1218 ; AVX512VLBW-LABEL: splatvar_rotate_v16i8:
   1219 ; AVX512VLBW:       # %bb.0:
   1220 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
   1221 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
   1222 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
   1223 ; AVX512VLBW-NEXT:    vpsllvw %ymm2, %ymm0, %ymm2
   1224 ; AVX512VLBW-NEXT:    vpmovwb %ymm2, %xmm2
   1225 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
   1226 ; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
   1227 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
   1228 ; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
   1229 ; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
   1230 ; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
   1231 ; AVX512VLBW-NEXT:    vzeroupper
   1232 ; AVX512VLBW-NEXT:    retq
   1233 ;
   1234 ; XOPAVX1-LABEL: splatvar_rotate_v16i8:
   1235 ; XOPAVX1:       # %bb.0:
   1236 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
   1237 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1238 ; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
   1239 ; XOPAVX1-NEXT:    retq
   1240 ;
   1241 ; XOPAVX2-LABEL: splatvar_rotate_v16i8:
   1242 ; XOPAVX2:       # %bb.0:
   1243 ; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
   1244 ; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
   1245 ; XOPAVX2-NEXT:    retq
   1246 ;
   1247 ; X32-SSE-LABEL: splatvar_rotate_v16i8:
   1248 ; X32-SSE:       # %bb.0:
   1249 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
   1250 ; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1251 ; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
   1252 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
   1253 ; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
   1254 ; X32-SSE-NEXT:    psrlw $4, %xmm0
   1255 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
   1256 ; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
   1257 ; X32-SSE-NEXT:    psllw $4, %xmm3
   1258 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
   1259 ; X32-SSE-NEXT:    por %xmm0, %xmm3
   1260 ; X32-SSE-NEXT:    psllw $5, %xmm1
   1261 ; X32-SSE-NEXT:    pxor %xmm0, %xmm0
   1262 ; X32-SSE-NEXT:    pxor %xmm4, %xmm4
   1263 ; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm4
   1264 ; X32-SSE-NEXT:    pand %xmm4, %xmm3
   1265 ; X32-SSE-NEXT:    pandn %xmm2, %xmm4
   1266 ; X32-SSE-NEXT:    por %xmm3, %xmm4
   1267 ; X32-SSE-NEXT:    movdqa %xmm4, %xmm2
   1268 ; X32-SSE-NEXT:    psrlw $6, %xmm2
   1269 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
   1270 ; X32-SSE-NEXT:    movdqa %xmm4, %xmm3
   1271 ; X32-SSE-NEXT:    psllw $2, %xmm3
   1272 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
   1273 ; X32-SSE-NEXT:    por %xmm2, %xmm3
   1274 ; X32-SSE-NEXT:    paddb %xmm1, %xmm1
   1275 ; X32-SSE-NEXT:    pxor %xmm2, %xmm2
   1276 ; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
   1277 ; X32-SSE-NEXT:    pand %xmm2, %xmm3
   1278 ; X32-SSE-NEXT:    pandn %xmm4, %xmm2
   1279 ; X32-SSE-NEXT:    por %xmm3, %xmm2
   1280 ; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
   1281 ; X32-SSE-NEXT:    paddb %xmm2, %xmm3
   1282 ; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
   1283 ; X32-SSE-NEXT:    psrlw $7, %xmm4
   1284 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
   1285 ; X32-SSE-NEXT:    por %xmm3, %xmm4
   1286 ; X32-SSE-NEXT:    paddb %xmm1, %xmm1
   1287 ; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm0
   1288 ; X32-SSE-NEXT:    pand %xmm0, %xmm4
   1289 ; X32-SSE-NEXT:    pandn %xmm2, %xmm0
   1290 ; X32-SSE-NEXT:    por %xmm4, %xmm0
   1291 ; X32-SSE-NEXT:    retl
   1292   %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
   1293   %splat8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
   1294   %shl = shl <16 x i8> %a, %splat
   1295   %lshr = lshr <16 x i8> %a, %splat8
   1296   %or = or <16 x i8> %shl, %lshr
   1297   ret <16 x i8> %or
   1298 }
   1299 
   1300 ;
   1301 ; Constant Rotates
   1302 ;
   1303 
   1304 define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
   1305 ; SSE2-LABEL: constant_rotate_v2i64:
   1306 ; SSE2:       # %bb.0:
   1307 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
   1308 ; SSE2-NEXT:    psllq $4, %xmm1
   1309 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1310 ; SSE2-NEXT:    psllq $14, %xmm2
   1311 ; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
   1312 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
   1313 ; SSE2-NEXT:    psrlq $60, %xmm1
   1314 ; SSE2-NEXT:    psrlq $50, %xmm0
   1315 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   1316 ; SSE2-NEXT:    orpd %xmm2, %xmm0
   1317 ; SSE2-NEXT:    retq
   1318 ;
   1319 ; SSE41-LABEL: constant_rotate_v2i64:
   1320 ; SSE41:       # %bb.0:
   1321 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
   1322 ; SSE41-NEXT:    psllq $14, %xmm1
   1323 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
   1324 ; SSE41-NEXT:    psllq $4, %xmm2
   1325 ; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
   1326 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
   1327 ; SSE41-NEXT:    psrlq $50, %xmm1
   1328 ; SSE41-NEXT:    psrlq $60, %xmm0
   1329 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
   1330 ; SSE41-NEXT:    por %xmm2, %xmm0
   1331 ; SSE41-NEXT:    retq
   1332 ;
   1333 ; AVX1-LABEL: constant_rotate_v2i64:
   1334 ; AVX1:       # %bb.0:
   1335 ; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm1
   1336 ; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm2
   1337 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
   1338 ; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm2
   1339 ; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm0
   1340 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
   1341 ; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
   1342 ; AVX1-NEXT:    retq
   1343 ;
   1344 ; AVX2-LABEL: constant_rotate_v2i64:
   1345 ; AVX2:       # %bb.0:
   1346 ; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm1
   1347 ; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
   1348 ; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
   1349 ; AVX2-NEXT:    retq
   1350 ;
   1351 ; AVX512F-LABEL: constant_rotate_v2i64:
   1352 ; AVX512F:       # %bb.0:
   1353 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
   1354 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,14]
   1355 ; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
   1356 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
   1357 ; AVX512F-NEXT:    vzeroupper
   1358 ; AVX512F-NEXT:    retq
   1359 ;
   1360 ; AVX512VL-LABEL: constant_rotate_v2i64:
   1361 ; AVX512VL:       # %bb.0:
   1362 ; AVX512VL-NEXT:    vprolvq {{.*}}(%rip), %xmm0, %xmm0
   1363 ; AVX512VL-NEXT:    retq
   1364 ;
   1365 ; AVX512BW-LABEL: constant_rotate_v2i64:
   1366 ; AVX512BW:       # %bb.0:
   1367 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
   1368 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,14]
   1369 ; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
   1370 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
   1371 ; AVX512BW-NEXT:    vzeroupper
   1372 ; AVX512BW-NEXT:    retq
   1373 ;
   1374 ; AVX512VLBW-LABEL: constant_rotate_v2i64:
   1375 ; AVX512VLBW:       # %bb.0:
   1376 ; AVX512VLBW-NEXT:    vprolvq {{.*}}(%rip), %xmm0, %xmm0
   1377 ; AVX512VLBW-NEXT:    retq
   1378 ;
   1379 ; XOP-LABEL: constant_rotate_v2i64:
   1380 ; XOP:       # %bb.0:
   1381 ; XOP-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
   1382 ; XOP-NEXT:    retq
   1383 ;
   1384 ; X32-SSE-LABEL: constant_rotate_v2i64:
   1385 ; X32-SSE:       # %bb.0:
   1386 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
   1387 ; X32-SSE-NEXT:    psllq $4, %xmm1
   1388 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
   1389 ; X32-SSE-NEXT:    psllq $14, %xmm2
   1390 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
   1391 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
   1392 ; X32-SSE-NEXT:    psrlq $60, %xmm1
   1393 ; X32-SSE-NEXT:    psrlq $50, %xmm0
   1394 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   1395 ; X32-SSE-NEXT:    orpd %xmm2, %xmm0
   1396 ; X32-SSE-NEXT:    retl
   1397   %shl = shl <2 x i64> %a, <i64 4, i64 14>
   1398   %lshr = lshr <2 x i64> %a, <i64 60, i64 50>
   1399   %or = or <2 x i64> %shl, %lshr
   1400   ret <2 x i64> %or
   1401 }
   1402 
   1403 define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
   1404 ; SSE2-LABEL: constant_rotate_v4i32:
   1405 ; SSE2:       # %bb.0:
   1406 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
   1407 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
   1408 ; SSE2-NEXT:    pmuludq %xmm1, %xmm0
   1409 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
   1410 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
   1411 ; SSE2-NEXT:    pmuludq %xmm2, %xmm1
   1412 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
   1413 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
   1414 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1415 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   1416 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1417 ; SSE2-NEXT:    por %xmm3, %xmm0
   1418 ; SSE2-NEXT:    retq
   1419 ;
   1420 ; SSE41-LABEL: constant_rotate_v4i32:
   1421 ; SSE41:       # %bb.0:
   1422 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
   1423 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
   1424 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
   1425 ; SSE41-NEXT:    pmuludq %xmm2, %xmm3
   1426 ; SSE41-NEXT:    pmuludq %xmm1, %xmm0
   1427 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1428 ; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
   1429 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
   1430 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
   1431 ; SSE41-NEXT:    por %xmm1, %xmm0
   1432 ; SSE41-NEXT:    retq
   1433 ;
   1434 ; AVX1-LABEL: constant_rotate_v4i32:
   1435 ; AVX1:       # %bb.0:
   1436 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,32,64,128]
   1437 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
   1438 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
   1439 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
   1440 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
   1441 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1442 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
   1443 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
   1444 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
   1445 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1446 ; AVX1-NEXT:    retq
   1447 ;
   1448 ; AVX2-LABEL: constant_rotate_v4i32:
   1449 ; AVX2:       # %bb.0:
   1450 ; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
   1451 ; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
   1452 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1453 ; AVX2-NEXT:    retq
   1454 ;
   1455 ; AVX512F-LABEL: constant_rotate_v4i32:
   1456 ; AVX512F:       # %bb.0:
   1457 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
   1458 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
   1459 ; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
   1460 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
   1461 ; AVX512F-NEXT:    vzeroupper
   1462 ; AVX512F-NEXT:    retq
   1463 ;
   1464 ; AVX512VL-LABEL: constant_rotate_v4i32:
   1465 ; AVX512VL:       # %bb.0:
   1466 ; AVX512VL-NEXT:    vprolvd {{.*}}(%rip), %xmm0, %xmm0
   1467 ; AVX512VL-NEXT:    retq
   1468 ;
   1469 ; AVX512BW-LABEL: constant_rotate_v4i32:
   1470 ; AVX512BW:       # %bb.0:
   1471 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
   1472 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
   1473 ; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
   1474 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
   1475 ; AVX512BW-NEXT:    vzeroupper
   1476 ; AVX512BW-NEXT:    retq
   1477 ;
   1478 ; AVX512VLBW-LABEL: constant_rotate_v4i32:
   1479 ; AVX512VLBW:       # %bb.0:
   1480 ; AVX512VLBW-NEXT:    vprolvd {{.*}}(%rip), %xmm0, %xmm0
   1481 ; AVX512VLBW-NEXT:    retq
   1482 ;
   1483 ; XOP-LABEL: constant_rotate_v4i32:
   1484 ; XOP:       # %bb.0:
   1485 ; XOP-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
   1486 ; XOP-NEXT:    retq
   1487 ;
   1488 ; X32-SSE-LABEL: constant_rotate_v4i32:
   1489 ; X32-SSE:       # %bb.0:
   1490 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
   1491 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
   1492 ; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
   1493 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
   1494 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
   1495 ; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
   1496 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
   1497 ; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
   1498 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1499 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   1500 ; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1501 ; X32-SSE-NEXT:    por %xmm3, %xmm0
   1502 ; X32-SSE-NEXT:    retl
   1503   %shl = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
   1504   %lshr = lshr <4 x i32> %a, <i32 28, i32 27, i32 26, i32 25>
   1505   %or = or <4 x i32> %shl, %lshr
   1506   ret <4 x i32> %or
   1507 }
   1508 
   1509 define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
   1510 ; SSE-LABEL: constant_rotate_v8i16:
   1511 ; SSE:       # %bb.0:
   1512 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
   1513 ; SSE-NEXT:    movdqa %xmm0, %xmm2
   1514 ; SSE-NEXT:    pmulhuw %xmm1, %xmm2
   1515 ; SSE-NEXT:    pmullw %xmm1, %xmm0
   1516 ; SSE-NEXT:    por %xmm2, %xmm0
   1517 ; SSE-NEXT:    retq
   1518 ;
   1519 ; AVX-LABEL: constant_rotate_v8i16:
   1520 ; AVX:       # %bb.0:
   1521 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
   1522 ; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
   1523 ; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   1524 ; AVX-NEXT:    vpor %xmm2, %xmm0, %xmm0
   1525 ; AVX-NEXT:    retq
   1526 ;
   1527 ; AVX512F-LABEL: constant_rotate_v8i16:
   1528 ; AVX512F:       # %bb.0:
   1529 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
   1530 ; AVX512F-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
   1531 ; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   1532 ; AVX512F-NEXT:    vpor %xmm2, %xmm0, %xmm0
   1533 ; AVX512F-NEXT:    retq
   1534 ;
   1535 ; AVX512VL-LABEL: constant_rotate_v8i16:
   1536 ; AVX512VL:       # %bb.0:
   1537 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
   1538 ; AVX512VL-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
   1539 ; AVX512VL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   1540 ; AVX512VL-NEXT:    vpor %xmm2, %xmm0, %xmm0
   1541 ; AVX512VL-NEXT:    retq
   1542 ;
   1543 ; AVX512BW-LABEL: constant_rotate_v8i16:
   1544 ; AVX512BW:       # %bb.0:
   1545 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
   1546 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
   1547 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9]
   1548 ; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm2
   1549 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
   1550 ; AVX512BW-NEXT:    vpor %xmm2, %xmm0, %xmm0
   1551 ; AVX512BW-NEXT:    vzeroupper
   1552 ; AVX512BW-NEXT:    retq
   1553 ;
   1554 ; AVX512VLBW-LABEL: constant_rotate_v8i16:
   1555 ; AVX512VLBW:       # %bb.0:
   1556 ; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %xmm0, %xmm1
   1557 ; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %xmm0, %xmm0
   1558 ; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1559 ; AVX512VLBW-NEXT:    retq
   1560 ;
   1561 ; XOP-LABEL: constant_rotate_v8i16:
   1562 ; XOP:       # %bb.0:
   1563 ; XOP-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm0
   1564 ; XOP-NEXT:    retq
   1565 ;
   1566 ; X32-SSE-LABEL: constant_rotate_v8i16:
   1567 ; X32-SSE:       # %bb.0:
   1568 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
   1569 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
   1570 ; X32-SSE-NEXT:    pmulhuw %xmm1, %xmm2
   1571 ; X32-SSE-NEXT:    pmullw %xmm1, %xmm0
   1572 ; X32-SSE-NEXT:    por %xmm2, %xmm0
   1573 ; X32-SSE-NEXT:    retl
   1574   %shl = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
   1575   %lshr = lshr <8 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9>
   1576   %or = or <8 x i16> %shl, %lshr
   1577   ret <8 x i16> %or
   1578 }
   1579 
   1580 define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
   1581 ; SSE2-LABEL: constant_rotate_v16i8:
   1582 ; SSE2:       # %bb.0:
   1583 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
   1584 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
   1585 ; SSE2-NEXT:    pxor %xmm0, %xmm0
   1586 ; SSE2-NEXT:    pxor %xmm3, %xmm3
   1587 ; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
   1588 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   1589 ; SSE2-NEXT:    psrlw $4, %xmm4
   1590 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
   1591 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   1592 ; SSE2-NEXT:    psllw $4, %xmm5
   1593 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm5
   1594 ; SSE2-NEXT:    por %xmm4, %xmm5
   1595 ; SSE2-NEXT:    pand %xmm3, %xmm5
   1596 ; SSE2-NEXT:    pandn %xmm1, %xmm3
   1597 ; SSE2-NEXT:    por %xmm5, %xmm3
   1598 ; SSE2-NEXT:    movdqa %xmm3, %xmm1
   1599 ; SSE2-NEXT:    psrlw $6, %xmm1
   1600 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
   1601 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
   1602 ; SSE2-NEXT:    psllw $2, %xmm4
   1603 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
   1604 ; SSE2-NEXT:    por %xmm1, %xmm4
   1605 ; SSE2-NEXT:    paddb %xmm2, %xmm2
   1606 ; SSE2-NEXT:    pxor %xmm1, %xmm1
   1607 ; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
   1608 ; SSE2-NEXT:    pand %xmm1, %xmm4
   1609 ; SSE2-NEXT:    pandn %xmm3, %xmm1
   1610 ; SSE2-NEXT:    por %xmm4, %xmm1
   1611 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1612 ; SSE2-NEXT:    paddb %xmm1, %xmm3
   1613 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   1614 ; SSE2-NEXT:    psrlw $7, %xmm4
   1615 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
   1616 ; SSE2-NEXT:    por %xmm3, %xmm4
   1617 ; SSE2-NEXT:    paddb %xmm2, %xmm2
   1618 ; SSE2-NEXT:    pcmpgtb %xmm2, %xmm0
   1619 ; SSE2-NEXT:    pand %xmm0, %xmm4
   1620 ; SSE2-NEXT:    pandn %xmm1, %xmm0
   1621 ; SSE2-NEXT:    por %xmm4, %xmm0
   1622 ; SSE2-NEXT:    retq
   1623 ;
   1624 ; SSE41-LABEL: constant_rotate_v16i8:
   1625 ; SSE41:       # %bb.0:
   1626 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
   1627 ; SSE41-NEXT:    psrlw $4, %xmm0
   1628 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
   1629 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
   1630 ; SSE41-NEXT:    psllw $4, %xmm2
   1631 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
   1632 ; SSE41-NEXT:    por %xmm0, %xmm2
   1633 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [8192,24640,41088,57536,57600,41152,24704,8256]
   1634 ; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
   1635 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
   1636 ; SSE41-NEXT:    psrlw $6, %xmm2
   1637 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
   1638 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
   1639 ; SSE41-NEXT:    psllw $2, %xmm3
   1640 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
   1641 ; SSE41-NEXT:    por %xmm2, %xmm3
   1642 ; SSE41-NEXT:    paddb %xmm0, %xmm0
   1643 ; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
   1644 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
   1645 ; SSE41-NEXT:    paddb %xmm1, %xmm2
   1646 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
   1647 ; SSE41-NEXT:    psrlw $7, %xmm3
   1648 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
   1649 ; SSE41-NEXT:    por %xmm2, %xmm3
   1650 ; SSE41-NEXT:    paddb %xmm0, %xmm0
   1651 ; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
   1652 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
   1653 ; SSE41-NEXT:    retq
   1654 ;
   1655 ; AVX-LABEL: constant_rotate_v16i8:
   1656 ; AVX:       # %bb.0:
   1657 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
   1658 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
   1659 ; AVX-NEXT:    vpsllw $4, %xmm0, %xmm2
   1660 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
   1661 ; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
   1662 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
   1663 ; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
   1664 ; AVX-NEXT:    vpsrlw $6, %xmm0, %xmm1
   1665 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
   1666 ; AVX-NEXT:    vpsllw $2, %xmm0, %xmm3
   1667 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1668 ; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
   1669 ; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
   1670 ; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
   1671 ; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
   1672 ; AVX-NEXT:    vpsrlw $7, %xmm0, %xmm3
   1673 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1674 ; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
   1675 ; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
   1676 ; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
   1677 ; AVX-NEXT:    retq
   1678 ;
   1679 ; AVX512F-LABEL: constant_rotate_v16i8:
   1680 ; AVX512F:       # %bb.0:
   1681 ; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm1
   1682 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
   1683 ; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm2
   1684 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
   1685 ; AVX512F-NEXT:    vpor %xmm1, %xmm2, %xmm1
   1686 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
   1687 ; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
   1688 ; AVX512F-NEXT:    vpsrlw $6, %xmm0, %xmm1
   1689 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
   1690 ; AVX512F-NEXT:    vpsllw $2, %xmm0, %xmm3
   1691 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1692 ; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
   1693 ; AVX512F-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
   1694 ; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
   1695 ; AVX512F-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
   1696 ; AVX512F-NEXT:    vpsrlw $7, %xmm0, %xmm3
   1697 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1698 ; AVX512F-NEXT:    vpor %xmm3, %xmm1, %xmm1
   1699 ; AVX512F-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
   1700 ; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
   1701 ; AVX512F-NEXT:    retq
   1702 ;
   1703 ; AVX512VL-LABEL: constant_rotate_v16i8:
   1704 ; AVX512VL:       # %bb.0:
   1705 ; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm1
   1706 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
   1707 ; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm2
   1708 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
   1709 ; AVX512VL-NEXT:    vpor %xmm1, %xmm2, %xmm1
   1710 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
   1711 ; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
   1712 ; AVX512VL-NEXT:    vpsrlw $6, %xmm0, %xmm1
   1713 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
   1714 ; AVX512VL-NEXT:    vpsllw $2, %xmm0, %xmm3
   1715 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1716 ; AVX512VL-NEXT:    vpor %xmm1, %xmm3, %xmm1
   1717 ; AVX512VL-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
   1718 ; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
   1719 ; AVX512VL-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
   1720 ; AVX512VL-NEXT:    vpsrlw $7, %xmm0, %xmm3
   1721 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
   1722 ; AVX512VL-NEXT:    vpor %xmm3, %xmm1, %xmm1
   1723 ; AVX512VL-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
   1724 ; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
   1725 ; AVX512VL-NEXT:    retq
   1726 ;
   1727 ; AVX512BW-LABEL: constant_rotate_v16i8:
   1728 ; AVX512BW:       # %bb.0:
   1729 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
   1730 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
   1731 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
   1732 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
   1733 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
   1734 ; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
   1735 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   1736 ; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1737 ; AVX512BW-NEXT:    vzeroupper
   1738 ; AVX512BW-NEXT:    retq
   1739 ;
   1740 ; AVX512VLBW-LABEL: constant_rotate_v16i8:
   1741 ; AVX512VLBW:       # %bb.0:
   1742 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
   1743 ; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
   1744 ; AVX512VLBW-NEXT:    vpmovwb %ymm1, %xmm1
   1745 ; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
   1746 ; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
   1747 ; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1748 ; AVX512VLBW-NEXT:    vzeroupper
   1749 ; AVX512VLBW-NEXT:    retq
   1750 ;
   1751 ; XOP-LABEL: constant_rotate_v16i8:
   1752 ; XOP:       # %bb.0:
   1753 ; XOP-NEXT:    vprotb {{.*}}(%rip), %xmm0, %xmm0
   1754 ; XOP-NEXT:    retq
   1755 ;
   1756 ; X32-SSE-LABEL: constant_rotate_v16i8:
   1757 ; X32-SSE:       # %bb.0:
   1758 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
   1759 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
   1760 ; X32-SSE-NEXT:    pxor %xmm0, %xmm0
   1761 ; X32-SSE-NEXT:    pxor %xmm3, %xmm3
   1762 ; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
   1763 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
   1764 ; X32-SSE-NEXT:    psrlw $4, %xmm4
   1765 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
   1766 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
   1767 ; X32-SSE-NEXT:    psllw $4, %xmm5
   1768 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm5
   1769 ; X32-SSE-NEXT:    por %xmm4, %xmm5
   1770 ; X32-SSE-NEXT:    pand %xmm3, %xmm5
   1771 ; X32-SSE-NEXT:    pandn %xmm1, %xmm3
   1772 ; X32-SSE-NEXT:    por %xmm5, %xmm3
   1773 ; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
   1774 ; X32-SSE-NEXT:    psrlw $6, %xmm1
   1775 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
   1776 ; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
   1777 ; X32-SSE-NEXT:    psllw $2, %xmm4
   1778 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
   1779 ; X32-SSE-NEXT:    por %xmm1, %xmm4
   1780 ; X32-SSE-NEXT:    paddb %xmm2, %xmm2
   1781 ; X32-SSE-NEXT:    pxor %xmm1, %xmm1
   1782 ; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
   1783 ; X32-SSE-NEXT:    pand %xmm1, %xmm4
   1784 ; X32-SSE-NEXT:    pandn %xmm3, %xmm1
   1785 ; X32-SSE-NEXT:    por %xmm4, %xmm1
   1786 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
   1787 ; X32-SSE-NEXT:    paddb %xmm1, %xmm3
   1788 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
   1789 ; X32-SSE-NEXT:    psrlw $7, %xmm4
   1790 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
   1791 ; X32-SSE-NEXT:    por %xmm3, %xmm4
   1792 ; X32-SSE-NEXT:    paddb %xmm2, %xmm2
   1793 ; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm0
   1794 ; X32-SSE-NEXT:    pand %xmm0, %xmm4
   1795 ; X32-SSE-NEXT:    pandn %xmm1, %xmm0
   1796 ; X32-SSE-NEXT:    por %xmm4, %xmm0
   1797 ; X32-SSE-NEXT:    retl
   1798   %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
   1799   %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
   1800   %or = or <16 x i8> %shl, %lshr
   1801   ret <16 x i8> %or
   1802 }
   1803 
   1804 ;
   1805 ; Uniform Constant Rotates
   1806 ;
   1807 
   1808 define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
   1809 ; SSE-LABEL: splatconstant_rotate_v2i64:
   1810 ; SSE:       # %bb.0:
   1811 ; SSE-NEXT:    movdqa %xmm0, %xmm1
   1812 ; SSE-NEXT:    psllq $14, %xmm1
   1813 ; SSE-NEXT:    psrlq $50, %xmm0
   1814 ; SSE-NEXT:    por %xmm1, %xmm0
   1815 ; SSE-NEXT:    retq
   1816 ;
   1817 ; AVX-LABEL: splatconstant_rotate_v2i64:
   1818 ; AVX:       # %bb.0:
   1819 ; AVX-NEXT:    vpsllq $14, %xmm0, %xmm1
   1820 ; AVX-NEXT:    vpsrlq $50, %xmm0, %xmm0
   1821 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
   1822 ; AVX-NEXT:    retq
   1823 ;
   1824 ; AVX512F-LABEL: splatconstant_rotate_v2i64:
   1825 ; AVX512F:       # %bb.0:
   1826 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
   1827 ; AVX512F-NEXT:    vprolq $14, %zmm0, %zmm0
   1828 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
   1829 ; AVX512F-NEXT:    vzeroupper
   1830 ; AVX512F-NEXT:    retq
   1831 ;
   1832 ; AVX512VL-LABEL: splatconstant_rotate_v2i64:
   1833 ; AVX512VL:       # %bb.0:
   1834 ; AVX512VL-NEXT:    vprolq $14, %xmm0, %xmm0
   1835 ; AVX512VL-NEXT:    retq
   1836 ;
   1837 ; AVX512BW-LABEL: splatconstant_rotate_v2i64:
   1838 ; AVX512BW:       # %bb.0:
   1839 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
   1840 ; AVX512BW-NEXT:    vprolq $14, %zmm0, %zmm0
   1841 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
   1842 ; AVX512BW-NEXT:    vzeroupper
   1843 ; AVX512BW-NEXT:    retq
   1844 ;
   1845 ; AVX512VLBW-LABEL: splatconstant_rotate_v2i64:
   1846 ; AVX512VLBW:       # %bb.0:
   1847 ; AVX512VLBW-NEXT:    vprolq $14, %xmm0, %xmm0
   1848 ; AVX512VLBW-NEXT:    retq
   1849 ;
   1850 ; XOP-LABEL: splatconstant_rotate_v2i64:
   1851 ; XOP:       # %bb.0:
   1852 ; XOP-NEXT:    vprotq $14, %xmm0, %xmm0
   1853 ; XOP-NEXT:    retq
   1854 ;
   1855 ; X32-SSE-LABEL: splatconstant_rotate_v2i64:
   1856 ; X32-SSE:       # %bb.0:
   1857 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
   1858 ; X32-SSE-NEXT:    psllq $14, %xmm1
   1859 ; X32-SSE-NEXT:    psrlq $50, %xmm0
   1860 ; X32-SSE-NEXT:    por %xmm1, %xmm0
   1861 ; X32-SSE-NEXT:    retl
   1862   %shl = shl <2 x i64> %a, <i64 14, i64 14>
   1863   %lshr = lshr <2 x i64> %a, <i64 50, i64 50>
   1864   %or = or <2 x i64> %shl, %lshr
   1865   ret <2 x i64> %or
   1866 }
   1867 
   1868 define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
   1869 ; SSE-LABEL: splatconstant_rotate_v4i32:
   1870 ; SSE:       # %bb.0:
   1871 ; SSE-NEXT:    movdqa %xmm0, %xmm1
   1872 ; SSE-NEXT:    psrld $28, %xmm1
   1873 ; SSE-NEXT:    pslld $4, %xmm0
   1874 ; SSE-NEXT:    por %xmm1, %xmm0
   1875 ; SSE-NEXT:    retq
   1876 ;
   1877 ; AVX-LABEL: splatconstant_rotate_v4i32:
   1878 ; AVX:       # %bb.0:
   1879 ; AVX-NEXT:    vpsrld $28, %xmm0, %xmm1
   1880 ; AVX-NEXT:    vpslld $4, %xmm0, %xmm0
   1881 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1882 ; AVX-NEXT:    retq
   1883 ;
   1884 ; AVX512F-LABEL: splatconstant_rotate_v4i32:
   1885 ; AVX512F:       # %bb.0:
   1886 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
   1887 ; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
   1888 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
   1889 ; AVX512F-NEXT:    vzeroupper
   1890 ; AVX512F-NEXT:    retq
   1891 ;
   1892 ; AVX512VL-LABEL: splatconstant_rotate_v4i32:
   1893 ; AVX512VL:       # %bb.0:
   1894 ; AVX512VL-NEXT:    vprold $4, %xmm0, %xmm0
   1895 ; AVX512VL-NEXT:    retq
   1896 ;
   1897 ; AVX512BW-LABEL: splatconstant_rotate_v4i32:
   1898 ; AVX512BW:       # %bb.0:
   1899 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
   1900 ; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
   1901 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
   1902 ; AVX512BW-NEXT:    vzeroupper
   1903 ; AVX512BW-NEXT:    retq
   1904 ;
   1905 ; AVX512VLBW-LABEL: splatconstant_rotate_v4i32:
   1906 ; AVX512VLBW:       # %bb.0:
   1907 ; AVX512VLBW-NEXT:    vprold $4, %xmm0, %xmm0
   1908 ; AVX512VLBW-NEXT:    retq
   1909 ;
   1910 ; XOP-LABEL: splatconstant_rotate_v4i32:
   1911 ; XOP:       # %bb.0:
   1912 ; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
   1913 ; XOP-NEXT:    retq
   1914 ;
   1915 ; X32-SSE-LABEL: splatconstant_rotate_v4i32:
   1916 ; X32-SSE:       # %bb.0:
   1917 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
   1918 ; X32-SSE-NEXT:    psrld $28, %xmm1
   1919 ; X32-SSE-NEXT:    pslld $4, %xmm0
   1920 ; X32-SSE-NEXT:    por %xmm1, %xmm0
   1921 ; X32-SSE-NEXT:    retl
   1922   %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
   1923   %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
   1924   %or = or <4 x i32> %shl, %lshr
   1925   ret <4 x i32> %or
   1926 }
   1927 
   1928 define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind {
   1929 ; SSE-LABEL: splatconstant_rotate_v8i16:
   1930 ; SSE:       # %bb.0:
   1931 ; SSE-NEXT:    movdqa %xmm0, %xmm1
   1932 ; SSE-NEXT:    psrlw $9, %xmm1
   1933 ; SSE-NEXT:    psllw $7, %xmm0
   1934 ; SSE-NEXT:    por %xmm1, %xmm0
   1935 ; SSE-NEXT:    retq
   1936 ;
   1937 ; AVX-LABEL: splatconstant_rotate_v8i16:
   1938 ; AVX:       # %bb.0:
   1939 ; AVX-NEXT:    vpsrlw $9, %xmm0, %xmm1
   1940 ; AVX-NEXT:    vpsllw $7, %xmm0, %xmm0
   1941 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1942 ; AVX-NEXT:    retq
   1943 ;
   1944 ; AVX512-LABEL: splatconstant_rotate_v8i16:
   1945 ; AVX512:       # %bb.0:
   1946 ; AVX512-NEXT:    vpsrlw $9, %xmm0, %xmm1
   1947 ; AVX512-NEXT:    vpsllw $7, %xmm0, %xmm0
   1948 ; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1949 ; AVX512-NEXT:    retq
   1950 ;
   1951 ; XOP-LABEL: splatconstant_rotate_v8i16:
   1952 ; XOP:       # %bb.0:
   1953 ; XOP-NEXT:    vprotw $7, %xmm0, %xmm0
   1954 ; XOP-NEXT:    retq
   1955 ;
   1956 ; X32-SSE-LABEL: splatconstant_rotate_v8i16:
   1957 ; X32-SSE:       # %bb.0:
   1958 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
   1959 ; X32-SSE-NEXT:    psrlw $9, %xmm1
   1960 ; X32-SSE-NEXT:    psllw $7, %xmm0
   1961 ; X32-SSE-NEXT:    por %xmm1, %xmm0
   1962 ; X32-SSE-NEXT:    retl
   1963   %shl = shl <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   1964   %lshr = lshr <8 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
   1965   %or = or <8 x i16> %shl, %lshr
   1966   ret <8 x i16> %or
   1967 }
   1968 
   1969 define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
   1970 ; SSE-LABEL: splatconstant_rotate_v16i8:
   1971 ; SSE:       # %bb.0:
   1972 ; SSE-NEXT:    movdqa %xmm0, %xmm1
   1973 ; SSE-NEXT:    psrlw $4, %xmm1
   1974 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
   1975 ; SSE-NEXT:    psllw $4, %xmm0
   1976 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
   1977 ; SSE-NEXT:    por %xmm1, %xmm0
   1978 ; SSE-NEXT:    retq
   1979 ;
   1980 ; AVX-LABEL: splatconstant_rotate_v16i8:
   1981 ; AVX:       # %bb.0:
   1982 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
   1983 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
   1984 ; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
   1985 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   1986 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1987 ; AVX-NEXT:    retq
   1988 ;
   1989 ; AVX512-LABEL: splatconstant_rotate_v16i8:
   1990 ; AVX512:       # %bb.0:
   1991 ; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm1
   1992 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
   1993 ; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm0
   1994 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   1995 ; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1996 ; AVX512-NEXT:    retq
   1997 ;
   1998 ; XOP-LABEL: splatconstant_rotate_v16i8:
   1999 ; XOP:       # %bb.0:
   2000 ; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
   2001 ; XOP-NEXT:    retq
   2002 ;
   2003 ; X32-SSE-LABEL: splatconstant_rotate_v16i8:
   2004 ; X32-SSE:       # %bb.0:
   2005 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
   2006 ; X32-SSE-NEXT:    psrlw $4, %xmm1
   2007 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
   2008 ; X32-SSE-NEXT:    psllw $4, %xmm0
   2009 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
   2010 ; X32-SSE-NEXT:    por %xmm1, %xmm0
   2011 ; X32-SSE-NEXT:    retl
   2012   %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   2013   %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   2014   %or = or <16 x i8> %shl, %lshr
   2015   ret <16 x i8> %or
   2016 }
   2017 
   2018 ;
   2019 ; Masked Uniform Constant Rotates
   2020 ;
   2021 
   2022 define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
   2023 ; SSE-LABEL: splatconstant_rotate_mask_v2i64:
   2024 ; SSE:       # %bb.0:
   2025 ; SSE-NEXT:    psrlq $49, %xmm0
   2026 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
   2027 ; SSE-NEXT:    retq
   2028 ;
   2029 ; AVX-LABEL: splatconstant_rotate_mask_v2i64:
   2030 ; AVX:       # %bb.0:
   2031 ; AVX-NEXT:    vpsrlq $49, %xmm0, %xmm0
   2032 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2033 ; AVX-NEXT:    retq
   2034 ;
   2035 ; AVX512F-LABEL: splatconstant_rotate_mask_v2i64:
   2036 ; AVX512F:       # %bb.0:
   2037 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
   2038 ; AVX512F-NEXT:    vprolq $15, %zmm0, %zmm0
   2039 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2040 ; AVX512F-NEXT:    vzeroupper
   2041 ; AVX512F-NEXT:    retq
   2042 ;
   2043 ; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64:
   2044 ; AVX512VL:       # %bb.0:
   2045 ; AVX512VL-NEXT:    vprolq $15, %xmm0, %xmm0
   2046 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2047 ; AVX512VL-NEXT:    retq
   2048 ;
   2049 ; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64:
   2050 ; AVX512BW:       # %bb.0:
   2051 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
   2052 ; AVX512BW-NEXT:    vprolq $15, %zmm0, %zmm0
   2053 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2054 ; AVX512BW-NEXT:    vzeroupper
   2055 ; AVX512BW-NEXT:    retq
   2056 ;
   2057 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v2i64:
   2058 ; AVX512VLBW:       # %bb.0:
   2059 ; AVX512VLBW-NEXT:    vprolq $15, %xmm0, %xmm0
   2060 ; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2061 ; AVX512VLBW-NEXT:    retq
   2062 ;
   2063 ; XOP-LABEL: splatconstant_rotate_mask_v2i64:
   2064 ; XOP:       # %bb.0:
   2065 ; XOP-NEXT:    vprotq $15, %xmm0, %xmm0
   2066 ; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2067 ; XOP-NEXT:    retq
   2068 ;
   2069 ; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64:
   2070 ; X32-SSE:       # %bb.0:
   2071 ; X32-SSE-NEXT:    psrlq $49, %xmm0
   2072 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
   2073 ; X32-SSE-NEXT:    retl
   2074   %shl = shl <2 x i64> %a, <i64 15, i64 15>
   2075   %lshr = lshr <2 x i64> %a, <i64 49, i64 49>
   2076   %rmask = and <2 x i64> %lshr, <i64 255, i64 127>
   2077   %lmask = and <2 x i64> %shl, <i64 65, i64 33>
   2078   %or = or <2 x i64> %lmask, %rmask
   2079   ret <2 x i64> %or
   2080 }
   2081 
   2082 define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
   2083 ; SSE-LABEL: splatconstant_rotate_mask_v4i32:
   2084 ; SSE:       # %bb.0:
   2085 ; SSE-NEXT:    movdqa %xmm0, %xmm1
   2086 ; SSE-NEXT:    psrld $28, %xmm1
   2087 ; SSE-NEXT:    pslld $4, %xmm0
   2088 ; SSE-NEXT:    por %xmm1, %xmm0
   2089 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
   2090 ; SSE-NEXT:    retq
   2091 ;
   2092 ; AVX-LABEL: splatconstant_rotate_mask_v4i32:
   2093 ; AVX:       # %bb.0:
   2094 ; AVX-NEXT:    vpsrld $28, %xmm0, %xmm1
   2095 ; AVX-NEXT:    vpslld $4, %xmm0, %xmm0
   2096 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
   2097 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2098 ; AVX-NEXT:    retq
   2099 ;
   2100 ; AVX512F-LABEL: splatconstant_rotate_mask_v4i32:
   2101 ; AVX512F:       # %bb.0:
   2102 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
   2103 ; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
   2104 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2105 ; AVX512F-NEXT:    vzeroupper
   2106 ; AVX512F-NEXT:    retq
   2107 ;
   2108 ; AVX512VL-LABEL: splatconstant_rotate_mask_v4i32:
   2109 ; AVX512VL:       # %bb.0:
   2110 ; AVX512VL-NEXT:    vprold $4, %xmm0, %xmm0
   2111 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2112 ; AVX512VL-NEXT:    retq
   2113 ;
   2114 ; AVX512BW-LABEL: splatconstant_rotate_mask_v4i32:
   2115 ; AVX512BW:       # %bb.0:
   2116 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
   2117 ; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
   2118 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2119 ; AVX512BW-NEXT:    vzeroupper
   2120 ; AVX512BW-NEXT:    retq
   2121 ;
   2122 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i32:
   2123 ; AVX512VLBW:       # %bb.0:
   2124 ; AVX512VLBW-NEXT:    vprold $4, %xmm0, %xmm0
   2125 ; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2126 ; AVX512VLBW-NEXT:    retq
   2127 ;
   2128 ; XOP-LABEL: splatconstant_rotate_mask_v4i32:
   2129 ; XOP:       # %bb.0:
   2130 ; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
   2131 ; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2132 ; XOP-NEXT:    retq
   2133 ;
   2134 ; X32-SSE-LABEL: splatconstant_rotate_mask_v4i32:
   2135 ; X32-SSE:       # %bb.0:
   2136 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
   2137 ; X32-SSE-NEXT:    psrld $28, %xmm1
   2138 ; X32-SSE-NEXT:    pslld $4, %xmm0
   2139 ; X32-SSE-NEXT:    por %xmm1, %xmm0
   2140 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
   2141 ; X32-SSE-NEXT:    retl
   2142   %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
   2143   %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
   2144   %rmask = and <4 x i32> %lshr, <i32 127, i32 255, i32 511, i32 1023>
   2145   %lmask = and <4 x i32> %shl, <i32 1023, i32 511, i32 255, i32 127>
   2146   %or = or <4 x i32> %lmask, %rmask
   2147   ret <4 x i32> %or
   2148 }
   2149 
   2150 define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
   2151 ; SSE-LABEL: splatconstant_rotate_mask_v8i16:
   2152 ; SSE:       # %bb.0:
   2153 ; SSE-NEXT:    movdqa %xmm0, %xmm1
   2154 ; SSE-NEXT:    psrlw $11, %xmm1
   2155 ; SSE-NEXT:    psllw $5, %xmm0
   2156 ; SSE-NEXT:    por %xmm1, %xmm0
   2157 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
   2158 ; SSE-NEXT:    retq
   2159 ;
   2160 ; AVX-LABEL: splatconstant_rotate_mask_v8i16:
   2161 ; AVX:       # %bb.0:
   2162 ; AVX-NEXT:    vpsrlw $11, %xmm0, %xmm1
   2163 ; AVX-NEXT:    vpsllw $5, %xmm0, %xmm0
   2164 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
   2165 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2166 ; AVX-NEXT:    retq
   2167 ;
   2168 ; AVX512-LABEL: splatconstant_rotate_mask_v8i16:
   2169 ; AVX512:       # %bb.0:
   2170 ; AVX512-NEXT:    vpsrlw $11, %xmm0, %xmm1
   2171 ; AVX512-NEXT:    vpsllw $5, %xmm0, %xmm0
   2172 ; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
   2173 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2174 ; AVX512-NEXT:    retq
   2175 ;
   2176 ; XOP-LABEL: splatconstant_rotate_mask_v8i16:
   2177 ; XOP:       # %bb.0:
   2178 ; XOP-NEXT:    vprotw $5, %xmm0, %xmm0
   2179 ; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2180 ; XOP-NEXT:    retq
   2181 ;
   2182 ; X32-SSE-LABEL: splatconstant_rotate_mask_v8i16:
   2183 ; X32-SSE:       # %bb.0:
   2184 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
   2185 ; X32-SSE-NEXT:    psrlw $11, %xmm1
   2186 ; X32-SSE-NEXT:    psllw $5, %xmm0
   2187 ; X32-SSE-NEXT:    por %xmm1, %xmm0
   2188 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
   2189 ; X32-SSE-NEXT:    retl
   2190   %shl = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
   2191   %lshr = lshr <8 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   2192   %rmask = and <8 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
   2193   %lmask = and <8 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
   2194   %or = or <8 x i16> %lmask, %rmask
   2195   ret <8 x i16> %or
   2196 }
   2197 
   2198 define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
   2199 ; SSE-LABEL: splatconstant_rotate_mask_v16i8:
   2200 ; SSE:       # %bb.0:
   2201 ; SSE-NEXT:    movdqa %xmm0, %xmm1
   2202 ; SSE-NEXT:    psrlw $4, %xmm1
   2203 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
   2204 ; SSE-NEXT:    psllw $4, %xmm0
   2205 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
   2206 ; SSE-NEXT:    por %xmm1, %xmm0
   2207 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
   2208 ; SSE-NEXT:    retq
   2209 ;
   2210 ; AVX-LABEL: splatconstant_rotate_mask_v16i8:
   2211 ; AVX:       # %bb.0:
   2212 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
   2213 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
   2214 ; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
   2215 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2216 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
   2217 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2218 ; AVX-NEXT:    retq
   2219 ;
   2220 ; AVX512-LABEL: splatconstant_rotate_mask_v16i8:
   2221 ; AVX512:       # %bb.0:
   2222 ; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm1
   2223 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
   2224 ; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm0
   2225 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2226 ; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
   2227 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2228 ; AVX512-NEXT:    retq
   2229 ;
   2230 ; XOP-LABEL: splatconstant_rotate_mask_v16i8:
   2231 ; XOP:       # %bb.0:
   2232 ; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
   2233 ; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   2234 ; XOP-NEXT:    retq
   2235 ;
   2236 ; X32-SSE-LABEL: splatconstant_rotate_mask_v16i8:
   2237 ; X32-SSE:       # %bb.0:
   2238 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
   2239 ; X32-SSE-NEXT:    psrlw $4, %xmm1
   2240 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
   2241 ; X32-SSE-NEXT:    psllw $4, %xmm0
   2242 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
   2243 ; X32-SSE-NEXT:    por %xmm1, %xmm0
   2244 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
   2245 ; X32-SSE-NEXT:    retl
   2246   %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   2247   %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   2248   %rmask = and <16 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
   2249   %lmask = and <16 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
   2250   %or = or <16 x i8> %lmask, %rmask
   2251   ret <16 x i8> %or
   2252 }
   2253