Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX2
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512,AVX512F
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512,AVX512BW
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,AVX,XOP
      8 
      9 ; fold (sdiv x, 1) -> x
        ; Scalar identity: sdiv by 1 must fold away completely, leaving only the
        ; ABI-required copy of the argument (%edi) into the return reg (%eax).
     10 define i32 @combine_sdiv_by_one(i32 %x) {
     11 ; CHECK-LABEL: combine_sdiv_by_one:
     12 ; CHECK:       # %bb.0:
     13 ; CHECK-NEXT:    movl %edi, %eax
     14 ; CHECK-NEXT:    retq
     15   %1 = sdiv i32 %x, 1
     16   ret i32 %1
     17 }
     18 
     19 define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
        ; Vector form of the identity fold: the input is already in %xmm0, so the
        ; expected codegen is a bare return with no instructions at all.
     20 ; CHECK-LABEL: combine_vec_sdiv_by_one:
     21 ; CHECK:       # %bb.0:
     22 ; CHECK-NEXT:    retq
     23   %1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
     24   ret <4 x i32> %1
     25 }
     26 
     27 ; fold (sdiv x, -1) -> 0 - x
        ; Division by -1 is plain negation, so a single negl is expected
        ; (INT_MIN / -1 overflows and is UB in IR, so it need not be honored).
     28 define i32 @combine_sdiv_by_negone(i32 %x) {
     29 ; CHECK-LABEL: combine_sdiv_by_negone:
     30 ; CHECK:       # %bb.0:
     31 ; CHECK-NEXT:    negl %edi
     32 ; CHECK-NEXT:    movl %edi, %eax
     33 ; CHECK-NEXT:    retq
     34   %1 = sdiv i32 %x, -1
     35   ret i32 %1
     36 }
     37 
     38 define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
        ; Vector sdiv by all-(-1) folds to (0 - x): zero is materialized with
        ; pxor/vpxor and the input is subtracted from it.
     39 ; SSE-LABEL: combine_vec_sdiv_by_negone:
     40 ; SSE:       # %bb.0:
     41 ; SSE-NEXT:    pxor %xmm1, %xmm1
     42 ; SSE-NEXT:    psubd %xmm0, %xmm1
     43 ; SSE-NEXT:    movdqa %xmm1, %xmm0
     44 ; SSE-NEXT:    retq
     45 ;
     46 ; AVX-LABEL: combine_vec_sdiv_by_negone:
     47 ; AVX:       # %bb.0:
     48 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
     49 ; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
     50 ; AVX-NEXT:    retq
     51   %1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
     52   ret <4 x i32> %1
     53 }
     54 
     55 ; fold (sdiv x, INT_MIN) -> select((icmp eq x, INT_MIN), 1, 0)
        ; Only INT_MIN / INT_MIN yields a nonzero quotient (1), so the division
        ; becomes an equality test materialized as cmp + sete into a zeroed %eax.
     56 define i32 @combine_sdiv_by_minsigned(i32 %x) {
     57 ; CHECK-LABEL: combine_sdiv_by_minsigned:
     58 ; CHECK:       # %bb.0:
     59 ; CHECK-NEXT:    xorl %eax, %eax
     60 ; CHECK-NEXT:    cmpl $-2147483648, %edi # imm = 0x80000000
     61 ; CHECK-NEXT:    sete %al
     62 ; CHECK-NEXT:    retq
     63   %1 = sdiv i32 %x, -2147483648
     64   ret i32 %1
     65 }
     66 
     67 define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
        ; Vector variant: lanes equal to INT_MIN compare to all-ones, and a
        ; logical right shift by 31 turns that mask into 1 (all other lanes 0).
        ; AVX512BW instead compares into a mask register and does a zero-masked
        ; broadcast of the constant 1.
     68 ; SSE-LABEL: combine_vec_sdiv_by_minsigned:
     69 ; SSE:       # %bb.0:
     70 ; SSE-NEXT:    pcmpeqd {{.*}}(%rip), %xmm0
     71 ; SSE-NEXT:    psrld $31, %xmm0
     72 ; SSE-NEXT:    retq
     73 ;
     74 ; AVX1-LABEL: combine_vec_sdiv_by_minsigned:
     75 ; AVX1:       # %bb.0:
     76 ; AVX1-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
     77 ; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
     78 ; AVX1-NEXT:    retq
     79 ;
     80 ; AVX2-LABEL: combine_vec_sdiv_by_minsigned:
     81 ; AVX2:       # %bb.0:
     82 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
     83 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
     84 ; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
     85 ; AVX2-NEXT:    retq
     86 ;
     87 ; AVX512F-LABEL: combine_vec_sdiv_by_minsigned:
     88 ; AVX512F:       # %bb.0:
     89 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
     90 ; AVX512F-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
     91 ; AVX512F-NEXT:    vpsrld $31, %xmm0, %xmm0
     92 ; AVX512F-NEXT:    retq
     93 ;
     94 ; AVX512BW-LABEL: combine_vec_sdiv_by_minsigned:
     95 ; AVX512BW:       # %bb.0:
     96 ; AVX512BW-NEXT:    vpcmpeqd {{.*}}(%rip){1to4}, %xmm0, %k1
     97 ; AVX512BW-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
     98 ; AVX512BW-NEXT:    retq
     99 ;
    100 ; XOP-LABEL: combine_vec_sdiv_by_minsigned:
    101 ; XOP:       # %bb.0:
    102 ; XOP-NEXT:    vpcomeqd {{.*}}(%rip), %xmm0, %xmm0
    103 ; XOP-NEXT:    vpsrld $31, %xmm0, %xmm0
    104 ; XOP-NEXT:    retq
    105   %1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
    106   ret <4 x i32> %1
    107 }
    108 
    109 ; TODO fold (sdiv x, x) -> 1
        ; Not folded yet: x/x still emits a real division.  cltd sign-extends
        ; %eax into %edx:%eax as idivl requires.
    110 define i32 @combine_sdiv_dupe(i32 %x) {
    111 ; CHECK-LABEL: combine_sdiv_dupe:
    112 ; CHECK:       # %bb.0:
    113 ; CHECK-NEXT:    movl %edi, %eax
    114 ; CHECK-NEXT:    cltd
    115 ; CHECK-NEXT:    idivl %edi
    116 ; CHECK-NEXT:    retq
    117   %1 = sdiv i32 %x, %x
    118   ret i32 %1
    119 }
    120 
    121 define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
        ; Vector x/x is not folded either and there is no SIMD integer divide on
        ; x86, so it is fully scalarized: each lane is extracted, divided by
        ; itself with idivl, and reinserted -- four divisions in total.
    122 ; SSE-LABEL: combine_vec_sdiv_dupe:
    123 ; SSE:       # %bb.0:
    124 ; SSE-NEXT:    pextrd $1, %xmm0, %ecx
    125 ; SSE-NEXT:    movl %ecx, %eax
    126 ; SSE-NEXT:    cltd
    127 ; SSE-NEXT:    idivl %ecx
    128 ; SSE-NEXT:    movl %eax, %ecx
    129 ; SSE-NEXT:    movd %xmm0, %esi
    130 ; SSE-NEXT:    movl %esi, %eax
    131 ; SSE-NEXT:    cltd
    132 ; SSE-NEXT:    idivl %esi
    133 ; SSE-NEXT:    movd %eax, %xmm1
    134 ; SSE-NEXT:    pinsrd $1, %ecx, %xmm1
    135 ; SSE-NEXT:    pextrd $2, %xmm0, %ecx
    136 ; SSE-NEXT:    movl %ecx, %eax
    137 ; SSE-NEXT:    cltd
    138 ; SSE-NEXT:    idivl %ecx
    139 ; SSE-NEXT:    pinsrd $2, %eax, %xmm1
    140 ; SSE-NEXT:    pextrd $3, %xmm0, %ecx
    141 ; SSE-NEXT:    movl %ecx, %eax
    142 ; SSE-NEXT:    cltd
    143 ; SSE-NEXT:    idivl %ecx
    144 ; SSE-NEXT:    pinsrd $3, %eax, %xmm1
    145 ; SSE-NEXT:    movdqa %xmm1, %xmm0
    146 ; SSE-NEXT:    retq
    147 ;
    148 ; AVX-LABEL: combine_vec_sdiv_dupe:
    149 ; AVX:       # %bb.0:
    150 ; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
    151 ; AVX-NEXT:    movl %ecx, %eax
    152 ; AVX-NEXT:    cltd
    153 ; AVX-NEXT:    idivl %ecx
    154 ; AVX-NEXT:    movl %eax, %ecx
    155 ; AVX-NEXT:    vmovd %xmm0, %esi
    156 ; AVX-NEXT:    movl %esi, %eax
    157 ; AVX-NEXT:    cltd
    158 ; AVX-NEXT:    idivl %esi
    159 ; AVX-NEXT:    vmovd %eax, %xmm1
    160 ; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
    161 ; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
    162 ; AVX-NEXT:    movl %ecx, %eax
    163 ; AVX-NEXT:    cltd
    164 ; AVX-NEXT:    idivl %ecx
    165 ; AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
    166 ; AVX-NEXT:    vpextrd $3, %xmm0, %ecx
    167 ; AVX-NEXT:    movl %ecx, %eax
    168 ; AVX-NEXT:    cltd
    169 ; AVX-NEXT:    idivl %ecx
    170 ; AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
    171 ; AVX-NEXT:    retq
    172   %1 = sdiv <4 x i32> %x, %x
    173   ret <4 x i32> %1
    174 }
    175 
    176 ; fold (sdiv x, y) -> (udiv x, y) iff x and y are positive
        ; The mask with 255 proves every lane is non-negative, so the signed
        ; divide by 4 lowers to a plain logical right shift by 2.
    177 define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
    178 ; SSE-LABEL: combine_vec_sdiv_by_pos0:
    179 ; SSE:       # %bb.0:
    180 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
    181 ; SSE-NEXT:    psrld $2, %xmm0
    182 ; SSE-NEXT:    retq
    183 ;
    184 ; AVX-LABEL: combine_vec_sdiv_by_pos0:
    185 ; AVX:       # %bb.0:
    186 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    187 ; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
    188 ; AVX-NEXT:    retq
    189   %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
    190   %2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
    191   ret <4 x i32> %2
    192 }
    193 
    194 define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
        ; Same known-positive trick with non-uniform powers of two <1,4,8,16>:
        ; SSE/AVX1 synthesize the per-lane logical shifts with shift+blend
        ; sequences, AVX2+ uses the variable shift vpsrlvd, and XOP uses vpshld.
    195 ; SSE-LABEL: combine_vec_sdiv_by_pos1:
    196 ; SSE:       # %bb.0:
    197 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
    198 ; SSE-NEXT:    movdqa %xmm0, %xmm2
    199 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    200 ; SSE-NEXT:    psrld $3, %xmm1
    201 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
    202 ; SSE-NEXT:    psrld $4, %xmm0
    203 ; SSE-NEXT:    psrld $2, %xmm2
    204 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
    205 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
    206 ; SSE-NEXT:    movdqa %xmm1, %xmm0
    207 ; SSE-NEXT:    retq
    208 ;
    209 ; AVX1-LABEL: combine_vec_sdiv_by_pos1:
    210 ; AVX1:       # %bb.0:
    211 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    212 ; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm1
    213 ; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm2
    214 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
    215 ; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm2
    216 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
    217 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
    218 ; AVX1-NEXT:    retq
    219 ;
    220 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pos1:
    221 ; AVX2ORLATER:       # %bb.0:
    222 ; AVX2ORLATER-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    223 ; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
    224 ; AVX2ORLATER-NEXT:    retq
    225 ;
    226 ; XOP-LABEL: combine_vec_sdiv_by_pos1:
    227 ; XOP:       # %bb.0:
    228 ; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    229 ; XOP-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
    230 ; XOP-NEXT:    retq
    231   %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
    232   %2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
    233   ret <4 x i32> %2
    234 }
    235 
    236 ; fold (sdiv x, (1 << c)) -> x >>u c
        ; Uniform signed divide by 4 (round toward zero): add the rounding bias
        ; ((x >>s 31) >>u 30, i.e. 3 for negative lanes, 0 otherwise) before the
        ; arithmetic shift right by 2.
    237 define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
    238 ; SSE-LABEL: combine_vec_sdiv_by_pow2a:
    239 ; SSE:       # %bb.0:
    240 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    241 ; SSE-NEXT:    psrad $31, %xmm1
    242 ; SSE-NEXT:    psrld $30, %xmm1
    243 ; SSE-NEXT:    paddd %xmm0, %xmm1
    244 ; SSE-NEXT:    psrad $2, %xmm1
    245 ; SSE-NEXT:    movdqa %xmm1, %xmm0
    246 ; SSE-NEXT:    retq
    247 ;
    248 ; AVX-LABEL: combine_vec_sdiv_by_pow2a:
    249 ; AVX:       # %bb.0:
    250 ; AVX-NEXT:    vpsrad $31, %xmm0, %xmm1
    251 ; AVX-NEXT:    vpsrld $30, %xmm1, %xmm1
    252 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
    253 ; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
    254 ; AVX-NEXT:    retq
    255   %1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
    256   ret <4 x i32> %1
    257 }
    258 
    259 define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
        ; Divisor -4: same bias + arithmetic-shift sequence as the +4 case,
        ; followed by a negation of the result (0 - quotient via pxor/psubd).
    260 ; SSE-LABEL: combine_vec_sdiv_by_pow2a_neg:
    261 ; SSE:       # %bb.0:
    262 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    263 ; SSE-NEXT:    psrad $31, %xmm1
    264 ; SSE-NEXT:    psrld $30, %xmm1
    265 ; SSE-NEXT:    paddd %xmm0, %xmm1
    266 ; SSE-NEXT:    psrad $2, %xmm1
    267 ; SSE-NEXT:    pxor %xmm0, %xmm0
    268 ; SSE-NEXT:    psubd %xmm1, %xmm0
    269 ; SSE-NEXT:    retq
    270 ;
    271 ; AVX-LABEL: combine_vec_sdiv_by_pow2a_neg:
    272 ; AVX:       # %bb.0:
    273 ; AVX-NEXT:    vpsrad $31, %xmm0, %xmm1
    274 ; AVX-NEXT:    vpsrld $30, %xmm1, %xmm1
    275 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
    276 ; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
    277 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    278 ; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
    279 ; AVX-NEXT:    retq
    280   %1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4>
    281   ret <4 x i32> %1
    282 }
    283 
    284 define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
        ; Non-uniform power-of-two divisors on i8 lanes.  There are no byte
        ; shifts on x86, so the per-lane logical shift of the sign mask (the
        ; rounding bias) is emulated by widening to i16 and multiplying by
        ; <1,4,2,16,8,32,64,2>; the per-lane arithmetic shift of the biased
        ; value is emulated with psraw/vpsraw steps selected by pblendvb (or
        ; vpsrlvd/vpsravd, vpsrlvw/vpsravw, vpshlb/vpshab on AVX512F/BW/XOP).
        ; Lanes 0 and 8 (divisor 1) are blended back from the unmodified input.
    285 ; SSE-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
    286 ; SSE:       # %bb.0:
    287 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    288 ; SSE-NEXT:    pxor %xmm0, %xmm0
    289 ; SSE-NEXT:    pcmpgtb %xmm1, %xmm0
    290 ; SSE-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    291 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,4,2,16,8,32,64,2]
    292 ; SSE-NEXT:    pmullw %xmm2, %xmm3
    293 ; SSE-NEXT:    psrlw $8, %xmm3
    294 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
    295 ; SSE-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    296 ; SSE-NEXT:    pmullw %xmm2, %xmm0
    297 ; SSE-NEXT:    psrlw $8, %xmm0
    298 ; SSE-NEXT:    packuswb %xmm0, %xmm3
    299 ; SSE-NEXT:    paddb %xmm1, %xmm3
    300 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
    301 ; SSE-NEXT:    movdqa %xmm2, %xmm4
    302 ; SSE-NEXT:    psraw $4, %xmm4
    303 ; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [16384,32800,41056,8384,16384,32800,41056,8384]
    304 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
    305 ; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
    306 ; SSE-NEXT:    movdqa %xmm2, %xmm4
    307 ; SSE-NEXT:    psraw $2, %xmm4
    308 ; SSE-NEXT:    paddw %xmm0, %xmm0
    309 ; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
    310 ; SSE-NEXT:    movdqa %xmm2, %xmm4
    311 ; SSE-NEXT:    psraw $1, %xmm4
    312 ; SSE-NEXT:    paddw %xmm0, %xmm0
    313 ; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
    314 ; SSE-NEXT:    psrlw $8, %xmm2
    315 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    316 ; SSE-NEXT:    movdqa %xmm3, %xmm4
    317 ; SSE-NEXT:    psraw $4, %xmm4
    318 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
    319 ; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
    320 ; SSE-NEXT:    movdqa %xmm3, %xmm4
    321 ; SSE-NEXT:    psraw $2, %xmm4
    322 ; SSE-NEXT:    paddw %xmm0, %xmm0
    323 ; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
    324 ; SSE-NEXT:    movdqa %xmm3, %xmm4
    325 ; SSE-NEXT:    psraw $1, %xmm4
    326 ; SSE-NEXT:    paddw %xmm0, %xmm0
    327 ; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
    328 ; SSE-NEXT:    psrlw $8, %xmm3
    329 ; SSE-NEXT:    packuswb %xmm2, %xmm3
    330 ; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
    331 ; SSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
    332 ; SSE-NEXT:    movdqa %xmm1, %xmm0
    333 ; SSE-NEXT:    retq
    334 ;
    335 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
    336 ; AVX1:       # %bb.0:
    337 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    338 ; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
    339 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
    340 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2]
    341 ; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
    342 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
    343 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
    344 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
    345 ; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
    346 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
    347 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
    348 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
    349 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
    350 ; AVX1-NEXT:    vpsraw $4, %xmm2, %xmm3
    351 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [16384,32800,41056,8384,16384,32800,41056,8384]
    352 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
    353 ; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
    354 ; AVX1-NEXT:    vpsraw $2, %xmm2, %xmm3
    355 ; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
    356 ; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
    357 ; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm3
    358 ; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
    359 ; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
    360 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
    361 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    362 ; AVX1-NEXT:    vpsraw $4, %xmm1, %xmm3
    363 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
    364 ; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
    365 ; AVX1-NEXT:    vpsraw $2, %xmm1, %xmm3
    366 ; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
    367 ; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
    368 ; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm3
    369 ; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
    370 ; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
    371 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
    372 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
    373 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
    374 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
    375 ; AVX1-NEXT:    retq
    376 ;
    377 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
    378 ; AVX2:       # %bb.0:
    379 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    380 ; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
    381 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
    382 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
    383 ; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
    384 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
    385 ; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
    386 ; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
    387 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
    388 ; AVX2-NEXT:    vpsraw $4, %xmm2, %xmm3
    389 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16384,32800,41056,8384,16384,32800,41056,8384]
    390 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
    391 ; AVX2-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
    392 ; AVX2-NEXT:    vpsraw $2, %xmm2, %xmm3
    393 ; AVX2-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
    394 ; AVX2-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
    395 ; AVX2-NEXT:    vpsraw $1, %xmm2, %xmm3
    396 ; AVX2-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
    397 ; AVX2-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
    398 ; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
    399 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    400 ; AVX2-NEXT:    vpsraw $4, %xmm1, %xmm3
    401 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
    402 ; AVX2-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
    403 ; AVX2-NEXT:    vpsraw $2, %xmm1, %xmm3
    404 ; AVX2-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
    405 ; AVX2-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
    406 ; AVX2-NEXT:    vpsraw $1, %xmm1, %xmm3
    407 ; AVX2-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
    408 ; AVX2-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
    409 ; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
    410 ; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
    411 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
    412 ; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
    413 ; AVX2-NEXT:    vzeroupper
    414 ; AVX2-NEXT:    retq
    415 ;
    416 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
    417 ; AVX512F:       # %bb.0:
    418 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    419 ; AVX512F-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
    420 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
    421 ; AVX512F-NEXT:    vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
    422 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
    423 ; AVX512F-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
    424 ; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
    425 ; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %zmm1, %zmm1
    426 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
    427 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
    428 ; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
    429 ; AVX512F-NEXT:    vzeroupper
    430 ; AVX512F-NEXT:    retq
    431 ;
    432 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
    433 ; AVX512BW:       # %bb.0:
    434 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    435 ; AVX512BW-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
    436 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
    437 ; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
    438 ; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
    439 ; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
    440 ; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
    441 ; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %ymm1, %ymm1
    442 ; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
    443 ; AVX512BW-NEXT:    movw $257, %ax # imm = 0x101
    444 ; AVX512BW-NEXT:    kmovd %eax, %k1
    445 ; AVX512BW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
    446 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
    447 ; AVX512BW-NEXT:    vzeroupper
    448 ; AVX512BW-NEXT:    retq
    449 ;
    450 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
    451 ; XOP:       # %bb.0:
    452 ; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    453 ; XOP-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
    454 ; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm1, %xmm1
    455 ; XOP-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
    456 ; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm1, %xmm1
    457 ; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
    458 ; XOP-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
    459 ; XOP-NEXT:    retq
    460   %1 = sdiv <16 x i8> %x, <i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2, i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2>
    461   ret <16 x i8> %1
    462 }
    463 
    464 define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
        ; i16 lanes with non-uniform power-of-two divisors: the sign word
        ; (psraw $15) is logically shifted per lane via pmulhuw by
        ; <1,4,2,16,8,32,64,2> to form the rounding bias; after the add, the
        ; per-lane arithmetic shift is built from psraw steps merged with
        ; pblendw (or vpsravd / vpsravw / vpshaw on newer subtargets), and
        ; lane 0 (divisor 1) is blended back from the unmodified input.
    465 ; SSE-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
    466 ; SSE:       # %bb.0:
    467 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    468 ; SSE-NEXT:    psraw $15, %xmm1
    469 ; SSE-NEXT:    pmulhuw {{.*}}(%rip), %xmm1
    470 ; SSE-NEXT:    paddw %xmm0, %xmm1
    471 ; SSE-NEXT:    movdqa %xmm1, %xmm2
    472 ; SSE-NEXT:    psraw $4, %xmm2
    473 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3],xmm1[4],xmm2[5,6],xmm1[7]
    474 ; SSE-NEXT:    movdqa %xmm2, %xmm3
    475 ; SSE-NEXT:    psraw $2, %xmm3
    476 ; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
    477 ; SSE-NEXT:    movdqa %xmm3, %xmm1
    478 ; SSE-NEXT:    psraw $1, %xmm1
    479 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7]
    480 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
    481 ; SSE-NEXT:    movdqa %xmm1, %xmm0
    482 ; SSE-NEXT:    retq
    483 ;
    484 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
    485 ; AVX1:       # %bb.0:
    486 ; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm1
    487 ; AVX1-NEXT:    vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
    488 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
    489 ; AVX1-NEXT:    vpsraw $4, %xmm1, %xmm2
    490 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4],xmm2[5,6],xmm1[7]
    491 ; AVX1-NEXT:    vpsraw $2, %xmm1, %xmm2
    492 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
    493 ; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm2
    494 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
    495 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
    496 ; AVX1-NEXT:    retq
    497 ;
    498 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
    499 ; AVX2:       # %bb.0:
    500 ; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm1
    501 ; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
    502 ; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
    503 ; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
    504 ; AVX2-NEXT:    vpsravd {{.*}}(%rip), %ymm1, %ymm1
    505 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
    506 ; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
    507 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
    508 ; AVX2-NEXT:    vzeroupper
    509 ; AVX2-NEXT:    retq
    510 ;
    511 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
    512 ; AVX512F:       # %bb.0:
    513 ; AVX512F-NEXT:    vpsraw $15, %xmm0, %xmm1
    514 ; AVX512F-NEXT:    vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
    515 ; AVX512F-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
    516 ; AVX512F-NEXT:    vpmovsxwd %xmm1, %ymm1
    517 ; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %ymm1, %ymm1
    518 ; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
    519 ; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
    520 ; AVX512F-NEXT:    vzeroupper
    521 ; AVX512F-NEXT:    retq
    522 ;
    523 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
    524 ; AVX512BW:       # %bb.0:
    525 ; AVX512BW-NEXT:    vpsraw $15, %xmm0, %xmm1
    526 ; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %xmm1, %xmm1
    527 ; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
    528 ; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %xmm1, %xmm1
    529 ; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
    530 ; AVX512BW-NEXT:    retq
    531 ;
    532 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
    533 ; XOP:       # %bb.0:
    534 ; XOP-NEXT:    vpsraw $15, %xmm0, %xmm1
    535 ; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm1, %xmm1
    536 ; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
    537 ; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm1, %xmm1
    538 ; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
    539 ; XOP-NEXT:    retq
    540   %1 = sdiv <8 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
    541   ret <8 x i16> %1
    542 }
    543 
    544 define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
    545 ; SSE-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
    546 ; SSE:       # %bb.0:
    547 ; SSE-NEXT:    movdqa %xmm0, %xmm2
    548 ; SSE-NEXT:    psraw $15, %xmm2
    549 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2]
    550 ; SSE-NEXT:    pmulhuw %xmm3, %xmm2
    551 ; SSE-NEXT:    paddw %xmm0, %xmm2
    552 ; SSE-NEXT:    movdqa %xmm2, %xmm4
    553 ; SSE-NEXT:    psraw $4, %xmm4
    554 ; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3],xmm2[4],xmm4[5,6],xmm2[7]
    555 ; SSE-NEXT:    movdqa %xmm4, %xmm5
    556 ; SSE-NEXT:    psraw $2, %xmm5
    557 ; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
    558 ; SSE-NEXT:    movdqa %xmm5, %xmm2
    559 ; SSE-NEXT:    psraw $1, %xmm2
    560 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3],xmm2[4,5],xmm5[6],xmm2[7]
    561 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
    562 ; SSE-NEXT:    movdqa %xmm1, %xmm0
    563 ; SSE-NEXT:    psraw $15, %xmm0
    564 ; SSE-NEXT:    pmulhuw %xmm3, %xmm0
    565 ; SSE-NEXT:    paddw %xmm1, %xmm0
    566 ; SSE-NEXT:    movdqa %xmm0, %xmm3
    567 ; SSE-NEXT:    psraw $4, %xmm3
    568 ; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3],xmm0[4],xmm3[5,6],xmm0[7]
    569 ; SSE-NEXT:    movdqa %xmm3, %xmm0
    570 ; SSE-NEXT:    psraw $2, %xmm0
    571 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
    572 ; SSE-NEXT:    movdqa %xmm0, %xmm3
    573 ; SSE-NEXT:    psraw $1, %xmm3
    574 ; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6],xmm3[7]
    575 ; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7]
    576 ; SSE-NEXT:    movdqa %xmm2, %xmm0
    577 ; SSE-NEXT:    movdqa %xmm3, %xmm1
    578 ; SSE-NEXT:    retq
    579 ;
    580 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
    581 ; AVX1:       # %bb.0:
    582 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    583 ; AVX1-NEXT:    vpsraw $15, %xmm1, %xmm2
    584 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2]
    585 ; AVX1-NEXT:    vpmulhuw %xmm3, %xmm2, %xmm2
    586 ; AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
    587 ; AVX1-NEXT:    vpsraw $4, %xmm1, %xmm2
    588 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4],xmm2[5,6],xmm1[7]
    589 ; AVX1-NEXT:    vpsraw $2, %xmm1, %xmm2
    590 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
    591 ; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm2
    592 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
    593 ; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm2
    594 ; AVX1-NEXT:    vpmulhuw %xmm3, %xmm2, %xmm2
    595 ; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm2
    596 ; AVX1-NEXT:    vpsraw $4, %xmm2, %xmm3
    597 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4],xmm3[5,6],xmm2[7]
    598 ; AVX1-NEXT:    vpsraw $2, %xmm2, %xmm3
    599 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
    600 ; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm3
    601 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
    602 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    603 ; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
    604 ; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
    605 ; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
    606 ; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
    607 ; AVX1-NEXT:    retq
    608 ;
    609 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
    610 ; AVX2:       # %bb.0:
    611 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    612 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
    613 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
    614 ; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm4
    615 ; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm4, %ymm4
    616 ; AVX2-NEXT:    vpaddw %ymm4, %ymm0, %ymm4
    617 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15]
    618 ; AVX2-NEXT:    vpsravd %ymm3, %ymm5, %ymm3
    619 ; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
    620 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
    621 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
    622 ; AVX2-NEXT:    vpsravd %ymm2, %ymm1, %ymm1
    623 ; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
    624 ; AVX2-NEXT:    vpackusdw %ymm3, %ymm1, %ymm1
    625 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
    626 ; AVX2-NEXT:    retq
    627 ;
    628 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
    629 ; AVX512F:       # %bb.0:
    630 ; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm1
    631 ; AVX512F-NEXT:    vpmulhuw {{.*}}(%rip), %ymm1, %ymm1
    632 ; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
    633 ; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
    634 ; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %zmm1, %zmm1
    635 ; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
    636 ; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
    637 ; AVX512F-NEXT:    retq
    638 ;
    639 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
    640 ; AVX512BW:       # %bb.0:
    641 ; AVX512BW-NEXT:    vpsraw $15, %ymm0, %ymm1
    642 ; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
    643 ; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
    644 ; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %ymm1, %ymm1
    645 ; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
    646 ; AVX512BW-NEXT:    retq
    647 ;
    648 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
    649 ; XOP:       # %bb.0:
    650 ; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
    651 ; XOP-NEXT:    vpsraw $15, %xmm1, %xmm2
    652 ; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [65520,65522,65521,65524,65523,65525,65526,65521]
    653 ; XOP-NEXT:    vpshlw %xmm3, %xmm2, %xmm2
    654 ; XOP-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
    655 ; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,65534,65535,65532,65533,65531,65530,65535]
    656 ; XOP-NEXT:    vpshaw %xmm2, %xmm1, %xmm1
    657 ; XOP-NEXT:    vpsraw $15, %xmm0, %xmm4
    658 ; XOP-NEXT:    vpshlw %xmm3, %xmm4, %xmm3
    659 ; XOP-NEXT:    vpaddw %xmm3, %xmm0, %xmm3
    660 ; XOP-NEXT:    vpshaw %xmm2, %xmm3, %xmm2
    661 ; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    662 ; XOP-NEXT:    vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0
    663 ; XOP-NEXT:    retq
    664   %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
    665   ret <16 x i16> %1
    666 }
    667 
    668 define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
    669 ; SSE-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
    670 ; SSE:       # %bb.0:
    671 ; SSE-NEXT:    movdqa %xmm1, %xmm4
    672 ; SSE-NEXT:    movdqa %xmm0, %xmm1
    673 ; SSE-NEXT:    psraw $15, %xmm0
    674 ; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [1,4,2,16,8,32,64,2]
    675 ; SSE-NEXT:    pmulhuw %xmm5, %xmm0
    676 ; SSE-NEXT:    paddw %xmm1, %xmm0
    677 ; SSE-NEXT:    movdqa %xmm0, %xmm6
    678 ; SSE-NEXT:    psraw $4, %xmm6
    679 ; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3],xmm0[4],xmm6[5,6],xmm0[7]
    680 ; SSE-NEXT:    movdqa %xmm6, %xmm7
    681 ; SSE-NEXT:    psraw $2, %xmm7
    682 ; SSE-NEXT:    pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
    683 ; SSE-NEXT:    movdqa %xmm7, %xmm0
    684 ; SSE-NEXT:    psraw $1, %xmm0
    685 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3],xmm0[4,5],xmm7[6],xmm0[7]
    686 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
    687 ; SSE-NEXT:    movdqa %xmm4, %xmm1
    688 ; SSE-NEXT:    psraw $15, %xmm1
    689 ; SSE-NEXT:    pmulhuw %xmm5, %xmm1
    690 ; SSE-NEXT:    paddw %xmm4, %xmm1
    691 ; SSE-NEXT:    movdqa %xmm1, %xmm6
    692 ; SSE-NEXT:    psraw $4, %xmm6
    693 ; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3],xmm1[4],xmm6[5,6],xmm1[7]
    694 ; SSE-NEXT:    movdqa %xmm6, %xmm7
    695 ; SSE-NEXT:    psraw $2, %xmm7
    696 ; SSE-NEXT:    pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
    697 ; SSE-NEXT:    movdqa %xmm7, %xmm1
    698 ; SSE-NEXT:    psraw $1, %xmm1
    699 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4,5],xmm7[6],xmm1[7]
    700 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7]
    701 ; SSE-NEXT:    movdqa %xmm2, %xmm4
    702 ; SSE-NEXT:    psraw $15, %xmm4
    703 ; SSE-NEXT:    pmulhuw %xmm5, %xmm4
    704 ; SSE-NEXT:    paddw %xmm2, %xmm4
    705 ; SSE-NEXT:    movdqa %xmm4, %xmm6
    706 ; SSE-NEXT:    psraw $4, %xmm6
    707 ; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3],xmm4[4],xmm6[5,6],xmm4[7]
    708 ; SSE-NEXT:    movdqa %xmm6, %xmm7
    709 ; SSE-NEXT:    psraw $2, %xmm7
    710 ; SSE-NEXT:    pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
    711 ; SSE-NEXT:    movdqa %xmm7, %xmm4
    712 ; SSE-NEXT:    psraw $1, %xmm4
    713 ; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7]
    714 ; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7]
    715 ; SSE-NEXT:    movdqa %xmm3, %xmm2
    716 ; SSE-NEXT:    psraw $15, %xmm2
    717 ; SSE-NEXT:    pmulhuw %xmm5, %xmm2
    718 ; SSE-NEXT:    paddw %xmm3, %xmm2
    719 ; SSE-NEXT:    movdqa %xmm2, %xmm5
    720 ; SSE-NEXT:    psraw $4, %xmm5
    721 ; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3],xmm2[4],xmm5[5,6],xmm2[7]
    722 ; SSE-NEXT:    movdqa %xmm5, %xmm2
    723 ; SSE-NEXT:    psraw $2, %xmm2
    724 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5],xmm2[6],xmm5[7]
    725 ; SSE-NEXT:    movdqa %xmm2, %xmm5
    726 ; SSE-NEXT:    psraw $1, %xmm5
    727 ; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7]
    728 ; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3,4,5,6,7]
    729 ; SSE-NEXT:    movdqa %xmm4, %xmm2
    730 ; SSE-NEXT:    movdqa %xmm5, %xmm3
    731 ; SSE-NEXT:    retq
    732 ;
    733 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
    734 ; AVX1:       # %bb.0:
    735 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
    736 ; AVX1-NEXT:    vpsraw $15, %xmm3, %xmm4
    737 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,4,2,16,8,32,64,2]
    738 ; AVX1-NEXT:    vpmulhuw %xmm2, %xmm4, %xmm4
    739 ; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
    740 ; AVX1-NEXT:    vpsraw $4, %xmm3, %xmm4
    741 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4],xmm4[5,6],xmm3[7]
    742 ; AVX1-NEXT:    vpsraw $2, %xmm3, %xmm4
    743 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
    744 ; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm4
    745 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
    746 ; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm4
    747 ; AVX1-NEXT:    vpmulhuw %xmm2, %xmm4, %xmm4
    748 ; AVX1-NEXT:    vpaddw %xmm4, %xmm0, %xmm4
    749 ; AVX1-NEXT:    vpsraw $4, %xmm4, %xmm5
    750 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4],xmm5[5,6],xmm4[7]
    751 ; AVX1-NEXT:    vpsraw $2, %xmm4, %xmm5
    752 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
    753 ; AVX1-NEXT:    vpsraw $1, %xmm4, %xmm5
    754 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7]
    755 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
    756 ; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
    757 ; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
    758 ; AVX1-NEXT:    vandnps %ymm0, %ymm4, %ymm0
    759 ; AVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
    760 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
    761 ; AVX1-NEXT:    vpsraw $15, %xmm3, %xmm5
    762 ; AVX1-NEXT:    vpmulhuw %xmm2, %xmm5, %xmm5
    763 ; AVX1-NEXT:    vpaddw %xmm5, %xmm3, %xmm3
    764 ; AVX1-NEXT:    vpsraw $4, %xmm3, %xmm5
    765 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4],xmm5[5,6],xmm3[7]
    766 ; AVX1-NEXT:    vpsraw $2, %xmm3, %xmm5
    767 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
    768 ; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm5
    769 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6],xmm5[7]
    770 ; AVX1-NEXT:    vpsraw $15, %xmm1, %xmm5
    771 ; AVX1-NEXT:    vpmulhuw %xmm2, %xmm5, %xmm2
    772 ; AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm2
    773 ; AVX1-NEXT:    vpsraw $4, %xmm2, %xmm5
    774 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4],xmm5[5,6],xmm2[7]
    775 ; AVX1-NEXT:    vpsraw $2, %xmm2, %xmm5
    776 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7]
    777 ; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm5
    778 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7]
    779 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
    780 ; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
    781 ; AVX1-NEXT:    vandnps %ymm1, %ymm4, %ymm1
    782 ; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
    783 ; AVX1-NEXT:    retq
    784 ;
    785 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
    786 ; AVX2:       # %bb.0:
    787 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    788 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
    789 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
    790 ; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm5
    791 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [1,4,2,16,8,32,64,2,1,4,2,16,8,32,64,2]
    792 ; AVX2-NEXT:    # ymm6 = mem[0,1,0,1]
    793 ; AVX2-NEXT:    vpmulhuw %ymm6, %ymm5, %ymm5
    794 ; AVX2-NEXT:    vpaddw %ymm5, %ymm0, %ymm5
    795 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15]
    796 ; AVX2-NEXT:    vpsravd %ymm4, %ymm7, %ymm7
    797 ; AVX2-NEXT:    vpsrld $16, %ymm7, %ymm7
    798 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
    799 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11]
    800 ; AVX2-NEXT:    vpsravd %ymm3, %ymm5, %ymm5
    801 ; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
    802 ; AVX2-NEXT:    vpackusdw %ymm7, %ymm5, %ymm5
    803 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15]
    804 ; AVX2-NEXT:    vpsraw $15, %ymm1, %ymm5
    805 ; AVX2-NEXT:    vpmulhuw %ymm6, %ymm5, %ymm5
    806 ; AVX2-NEXT:    vpaddw %ymm5, %ymm1, %ymm5
    807 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15]
    808 ; AVX2-NEXT:    vpsravd %ymm4, %ymm6, %ymm4
    809 ; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
    810 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11]
    811 ; AVX2-NEXT:    vpsravd %ymm3, %ymm2, %ymm2
    812 ; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
    813 ; AVX2-NEXT:    vpackusdw %ymm4, %ymm2, %ymm2
    814 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
    815 ; AVX2-NEXT:    retq
    816 ;
    817 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
    818 ; AVX512F:       # %bb.0:
    819 ; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm2
    820 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,4,2,16,8,32,64,2,1,4,2,16,8,32,64,2]
    821 ; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
    822 ; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm2, %ymm2
    823 ; AVX512F-NEXT:    vpaddw %ymm2, %ymm0, %ymm2
    824 ; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
    825 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
    826 ; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
    827 ; AVX512F-NEXT:    vpsravd %zmm4, %zmm2, %zmm2
    828 ; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
    829 ; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
    830 ; AVX512F-NEXT:    vpsraw $15, %ymm1, %ymm2
    831 ; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm2, %ymm2
    832 ; AVX512F-NEXT:    vpaddw %ymm2, %ymm1, %ymm2
    833 ; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
    834 ; AVX512F-NEXT:    vpsravd %zmm4, %zmm2, %zmm2
    835 ; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
    836 ; AVX512F-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
    837 ; AVX512F-NEXT:    retq
    838 ;
    839 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
    840 ; AVX512BW:       # %bb.0:
    841 ; AVX512BW-NEXT:    vpsraw $15, %zmm0, %zmm1
    842 ; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
    843 ; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
    844 ; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %zmm1, %zmm1
    845 ; AVX512BW-NEXT:    movl $16843009, %eax # imm = 0x1010101
    846 ; AVX512BW-NEXT:    kmovd %eax, %k1
    847 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
    848 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
    849 ; AVX512BW-NEXT:    retq
    850 ;
    851 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
    852 ; XOP:       # %bb.0:
    853 ; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
    854 ; XOP-NEXT:    vpsraw $15, %xmm2, %xmm3
    855 ; XOP-NEXT:    vmovdqa {{.*#+}} xmm4 = [65520,65522,65521,65524,65523,65525,65526,65521]
    856 ; XOP-NEXT:    vpshlw %xmm4, %xmm3, %xmm3
    857 ; XOP-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
    858 ; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,65534,65535,65532,65533,65531,65530,65535]
    859 ; XOP-NEXT:    vpshaw %xmm3, %xmm2, %xmm2
    860 ; XOP-NEXT:    vpsraw $15, %xmm0, %xmm5
    861 ; XOP-NEXT:    vpshlw %xmm4, %xmm5, %xmm5
    862 ; XOP-NEXT:    vpaddw %xmm5, %xmm0, %xmm5
    863 ; XOP-NEXT:    vpshaw %xmm3, %xmm5, %xmm5
    864 ; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
    865 ; XOP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
    866 ; XOP-NEXT:    vpcmov %ymm5, %ymm0, %ymm2, %ymm0
    867 ; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
    868 ; XOP-NEXT:    vpsraw $15, %xmm2, %xmm6
    869 ; XOP-NEXT:    vpshlw %xmm4, %xmm6, %xmm6
    870 ; XOP-NEXT:    vpaddw %xmm6, %xmm2, %xmm2
    871 ; XOP-NEXT:    vpshaw %xmm3, %xmm2, %xmm2
    872 ; XOP-NEXT:    vpsraw $15, %xmm1, %xmm6
    873 ; XOP-NEXT:    vpshlw %xmm4, %xmm6, %xmm4
    874 ; XOP-NEXT:    vpaddw %xmm4, %xmm1, %xmm4
    875 ; XOP-NEXT:    vpshaw %xmm3, %xmm4, %xmm3
    876 ; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
    877 ; XOP-NEXT:    vpcmov %ymm5, %ymm1, %ymm2, %ymm1
    878 ; XOP-NEXT:    retq
    879   %1 = sdiv <32 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
    880   ret <32 x i16> %1
    881 }
    882 
; sdiv <4 x i32> by <1,4,8,16>. Lanes dividing by a power of two use the
; classic round-toward-zero trick (add the sign bit shifted right by 32-log2(d),
; then arithmetic shift); the divide-by-1 lane 0 is restored by the final
; blend from the original %x. Autogenerated CHECK lines — regenerate with
; update_llc_test_checks.py instead of editing by hand.
define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrld $28, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrld $30, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    psrld $29, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE-NEXT:    paddd %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm2, %xmm3
; SSE-NEXT:    psrad $3, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    psrad $4, %xmm2
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT:    vpsrld $30, %xmm1, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2ORLATER-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrad $31, %xmm0, %xmm1
; XOP-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpshad {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; XOP-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %1
}
    949 
; sdiv <8 x i32> by <1,4,8,16> repeated across both halves: same shift-based
; lowering as the v4i32 case, split per 128-bit lane on SSE/AVX1/XOP and done
; with variable shifts (vpsrlvd/vpsravd) on AVX2+. The divide-by-1 lanes are
; recovered by the final blend from %x. Autogenerated CHECK lines — use
; update_llc_test_checks.py to refresh.
define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrad $31, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    psrld $28, %xmm0
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld $30, %xmm4
; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    psrld $29, %xmm3
; SSE-NEXT:    pxor %xmm5, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
; SSE-NEXT:    paddd %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    psrad $3, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    psrad $4, %xmm3
; SSE-NEXT:    psrad $2, %xmm4
; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrad $31, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    psrld $28, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld $30, %xmm4
; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    psrld $29, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
; SSE-NEXT:    paddd %xmm1, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    psrad $3, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    psrad $4, %xmm3
; SSE-NEXT:    psrad $2, %xmm4
; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT:    vpsrld $28, %xmm2, %xmm3
; AVX1-NEXT:    vpsrld $30, %xmm2, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpsrld $29, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
; AVX1-NEXT:    vpsrld $28, %xmm2, %xmm3
; AVX1-NEXT:    vpsrld $30, %xmm2, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpsrld $29, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vpsrad $31, %ymm0, %ymm1
; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %ymm1, %ymm1
; AVX2ORLATER-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX2ORLATER-NEXT:    vpsravd {{.*}}(%rip), %ymm1, %ymm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT:    vpsrad $31, %xmm1, %xmm2
; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4294967264,4294967266,4294967267,4294967268]
; XOP-NEXT:    vpshld %xmm3, %xmm2, %xmm2
; XOP-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,4294967294,4294967293,4294967292]
; XOP-NEXT:    vpshad %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vpsrad $31, %xmm0, %xmm4
; XOP-NEXT:    vpshld %xmm3, %xmm4, %xmm3
; XOP-NEXT:    vpaddd %xmm3, %xmm0, %xmm3
; XOP-NEXT:    vpshad %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; XOP-NEXT:    retq
  %1 = sdiv <8 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
  ret <8 x i32> %1
}
   1062 
   1063 define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
   1064 ; SSE-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
   1065 ; SSE:       # %bb.0:
   1066 ; SSE-NEXT:    movdqa %xmm1, %xmm4
   1067 ; SSE-NEXT:    movdqa %xmm0, %xmm1
   1068 ; SSE-NEXT:    movdqa %xmm0, %xmm6
   1069 ; SSE-NEXT:    psrad $31, %xmm6
   1070 ; SSE-NEXT:    movdqa %xmm6, %xmm0
   1071 ; SSE-NEXT:    psrld $28, %xmm0
   1072 ; SSE-NEXT:    movdqa %xmm6, %xmm7
   1073 ; SSE-NEXT:    psrld $30, %xmm7
   1074 ; SSE-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm0[4,5,6,7]
   1075 ; SSE-NEXT:    psrld $29, %xmm6
   1076 ; SSE-NEXT:    pxor %xmm5, %xmm5
   1077 ; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
   1078 ; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
   1079 ; SSE-NEXT:    paddd %xmm1, %xmm6
   1080 ; SSE-NEXT:    movdqa %xmm6, %xmm7
   1081 ; SSE-NEXT:    movdqa %xmm6, %xmm0
   1082 ; SSE-NEXT:    psrad $3, %xmm0
   1083 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7]
   1084 ; SSE-NEXT:    psrad $4, %xmm6
   1085 ; SSE-NEXT:    psrad $2, %xmm7
   1086 ; SSE-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
   1087 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7]
   1088 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
   1089 ; SSE-NEXT:    movdqa %xmm4, %xmm6
   1090 ; SSE-NEXT:    psrad $31, %xmm6
   1091 ; SSE-NEXT:    movdqa %xmm6, %xmm1
   1092 ; SSE-NEXT:    psrld $28, %xmm1
   1093 ; SSE-NEXT:    movdqa %xmm6, %xmm7
   1094 ; SSE-NEXT:    psrld $30, %xmm7
   1095 ; SSE-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm1[4,5,6,7]
   1096 ; SSE-NEXT:    psrld $29, %xmm6
   1097 ; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
   1098 ; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
   1099 ; SSE-NEXT:    paddd %xmm4, %xmm6
   1100 ; SSE-NEXT:    movdqa %xmm6, %xmm7
   1101 ; SSE-NEXT:    movdqa %xmm6, %xmm1
   1102 ; SSE-NEXT:    psrad $3, %xmm1
   1103 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
   1104 ; SSE-NEXT:    psrad $4, %xmm6
   1105 ; SSE-NEXT:    psrad $2, %xmm7
   1106 ; SSE-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
   1107 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7]
   1108 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7]
   1109 ; SSE-NEXT:    movdqa %xmm2, %xmm6
   1110 ; SSE-NEXT:    psrad $31, %xmm6
   1111 ; SSE-NEXT:    movdqa %xmm6, %xmm4
   1112 ; SSE-NEXT:    psrld $28, %xmm4
   1113 ; SSE-NEXT:    movdqa %xmm6, %xmm7
   1114 ; SSE-NEXT:    psrld $30, %xmm7
   1115 ; SSE-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm4[4,5,6,7]
   1116 ; SSE-NEXT:    psrld $29, %xmm6
   1117 ; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
   1118 ; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
   1119 ; SSE-NEXT:    paddd %xmm2, %xmm6
   1120 ; SSE-NEXT:    movdqa %xmm6, %xmm7
   1121 ; SSE-NEXT:    movdqa %xmm6, %xmm4
   1122 ; SSE-NEXT:    psrad $3, %xmm4
   1123 ; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
   1124 ; SSE-NEXT:    psrad $4, %xmm6
   1125 ; SSE-NEXT:    psrad $2, %xmm7
   1126 ; SSE-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
   1127 ; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7]
   1128 ; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
   1129 ; SSE-NEXT:    movdqa %xmm3, %xmm2
   1130 ; SSE-NEXT:    psrad $31, %xmm2
   1131 ; SSE-NEXT:    movdqa %xmm2, %xmm6
   1132 ; SSE-NEXT:    psrld $28, %xmm6
   1133 ; SSE-NEXT:    movdqa %xmm2, %xmm7
   1134 ; SSE-NEXT:    psrld $30, %xmm7
   1135 ; SSE-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
   1136 ; SSE-NEXT:    psrld $29, %xmm2
   1137 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
   1138 ; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3],xmm2[4,5],xmm7[6,7]
   1139 ; SSE-NEXT:    paddd %xmm3, %xmm2
   1140 ; SSE-NEXT:    movdqa %xmm2, %xmm6
   1141 ; SSE-NEXT:    movdqa %xmm2, %xmm5
   1142 ; SSE-NEXT:    psrad $3, %xmm5
   1143 ; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
   1144 ; SSE-NEXT:    psrad $4, %xmm2
   1145 ; SSE-NEXT:    psrad $2, %xmm6
   1146 ; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
   1147 ; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
   1148 ; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7]
   1149 ; SSE-NEXT:    movdqa %xmm4, %xmm2
   1150 ; SSE-NEXT:    movdqa %xmm5, %xmm3
   1151 ; SSE-NEXT:    retq
   1152 ;
   1153 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
   1154 ; AVX1:       # %bb.0:
   1155 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1156 ; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm2
   1157 ; AVX1-NEXT:    vpsrld $28, %xmm2, %xmm4
   1158 ; AVX1-NEXT:    vpsrld $30, %xmm2, %xmm5
   1159 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
   1160 ; AVX1-NEXT:    vpsrld $29, %xmm2, %xmm5
   1161 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
   1162 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
   1163 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
   1164 ; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
   1165 ; AVX1-NEXT:    vpsrad $4, %xmm3, %xmm4
   1166 ; AVX1-NEXT:    vpsrad $2, %xmm3, %xmm5
   1167 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
   1168 ; AVX1-NEXT:    vpsrad $3, %xmm3, %xmm5
   1169 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
   1170 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
   1171 ; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm4
   1172 ; AVX1-NEXT:    vpsrld $28, %xmm4, %xmm5
   1173 ; AVX1-NEXT:    vpsrld $30, %xmm4, %xmm6
   1174 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
   1175 ; AVX1-NEXT:    vpsrld $29, %xmm4, %xmm4
   1176 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
   1177 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
   1178 ; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm4
   1179 ; AVX1-NEXT:    vpsrad $4, %xmm4, %xmm5
   1180 ; AVX1-NEXT:    vpsrad $2, %xmm4, %xmm6
   1181 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
   1182 ; AVX1-NEXT:    vpsrad $3, %xmm4, %xmm6
   1183 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
   1184 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
   1185 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
   1186 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7]
   1187 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   1188 ; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm4
   1189 ; AVX1-NEXT:    vpsrld $28, %xmm4, %xmm5
   1190 ; AVX1-NEXT:    vpsrld $30, %xmm4, %xmm6
   1191 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
   1192 ; AVX1-NEXT:    vpsrld $29, %xmm4, %xmm4
   1193 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
   1194 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
   1195 ; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
   1196 ; AVX1-NEXT:    vpsrad $4, %xmm3, %xmm4
   1197 ; AVX1-NEXT:    vpsrad $2, %xmm3, %xmm5
   1198 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
   1199 ; AVX1-NEXT:    vpsrad $3, %xmm3, %xmm5
   1200 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
   1201 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
   1202 ; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm4
   1203 ; AVX1-NEXT:    vpsrld $28, %xmm4, %xmm5
   1204 ; AVX1-NEXT:    vpsrld $30, %xmm4, %xmm6
   1205 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
   1206 ; AVX1-NEXT:    vpsrld $29, %xmm4, %xmm4
   1207 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
   1208 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
   1209 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm2
   1210 ; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm4
   1211 ; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm5
   1212 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
   1213 ; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm5
   1214 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
   1215 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
   1216 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
   1217 ; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
   1218 ; AVX1-NEXT:    retq
   1219 ;
   1220 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
   1221 ; AVX2:       # %bb.0:
   1222 ; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm2
   1223 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [32,30,29,28,32,30,29,28]
   1224 ; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
   1225 ; AVX2-NEXT:    vpsrlvd %ymm3, %ymm2, %ymm2
   1226 ; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
   1227 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,2,3,4,0,2,3,4]
   1228 ; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
   1229 ; AVX2-NEXT:    vpsravd %ymm4, %ymm2, %ymm2
   1230 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
   1231 ; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm2
   1232 ; AVX2-NEXT:    vpsrlvd %ymm3, %ymm2, %ymm2
   1233 ; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm2
   1234 ; AVX2-NEXT:    vpsravd %ymm4, %ymm2, %ymm2
   1235 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
   1236 ; AVX2-NEXT:    retq
   1237 ;
   1238 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
   1239 ; AVX512F:       # %bb.0:
   1240 ; AVX512F-NEXT:    vpsrad $31, %zmm0, %zmm1
   1241 ; AVX512F-NEXT:    vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
   1242 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
   1243 ; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %zmm1, %zmm1
   1244 ; AVX512F-NEXT:    movw $4369, %ax # imm = 0x1111
   1245 ; AVX512F-NEXT:    kmovw %eax, %k1
   1246 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
   1247 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
   1248 ; AVX512F-NEXT:    retq
   1249 ;
   1250 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
   1251 ; AVX512BW:       # %bb.0:
   1252 ; AVX512BW-NEXT:    vpsrad $31, %zmm0, %zmm1
   1253 ; AVX512BW-NEXT:    vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
   1254 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
   1255 ; AVX512BW-NEXT:    vpsravd {{.*}}(%rip), %zmm1, %zmm1
   1256 ; AVX512BW-NEXT:    movw $4369, %ax # imm = 0x1111
   1257 ; AVX512BW-NEXT:    kmovd %eax, %k1
   1258 ; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
   1259 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
   1260 ; AVX512BW-NEXT:    retq
   1261 ;
   1262 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
   1263 ; XOP:       # %bb.0:
   1264 ; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1265 ; XOP-NEXT:    vpsrad $31, %xmm2, %xmm3
   1266 ; XOP-NEXT:    vmovdqa {{.*#+}} xmm4 = [4294967264,4294967266,4294967267,4294967268]
   1267 ; XOP-NEXT:    vpshld %xmm4, %xmm3, %xmm3
   1268 ; XOP-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
   1269 ; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,4294967294,4294967293,4294967292]
   1270 ; XOP-NEXT:    vpshad %xmm3, %xmm2, %xmm2
   1271 ; XOP-NEXT:    vpsrad $31, %xmm0, %xmm5
   1272 ; XOP-NEXT:    vpshld %xmm4, %xmm5, %xmm5
   1273 ; XOP-NEXT:    vpaddd %xmm5, %xmm0, %xmm5
   1274 ; XOP-NEXT:    vpshad %xmm3, %xmm5, %xmm5
   1275 ; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
   1276 ; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
   1277 ; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1278 ; XOP-NEXT:    vpsrad $31, %xmm2, %xmm5
   1279 ; XOP-NEXT:    vpshld %xmm4, %xmm5, %xmm5
   1280 ; XOP-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
   1281 ; XOP-NEXT:    vpshad %xmm3, %xmm2, %xmm2
   1282 ; XOP-NEXT:    vpsrad $31, %xmm1, %xmm5
   1283 ; XOP-NEXT:    vpshld %xmm4, %xmm5, %xmm4
   1284 ; XOP-NEXT:    vpaddd %xmm4, %xmm1, %xmm4
   1285 ; XOP-NEXT:    vpshad %xmm3, %xmm4, %xmm3
   1286 ; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
   1287 ; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
   1288 ; XOP-NEXT:    retq
   1289   %1 = sdiv <16 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
   1290   ret <16 x i32> %1
   1291 }
   1292 
; Checks lowering of sdiv <2 x i64> by the mixed power-of-2 vector <1, 4>.
; The generated code computes the rounding bias from the sign bits (arithmetic
; shift / compare-greater-than against zero), logical-shifts it per lane, adds
; it to the input, then arithmetic-shifts by log2 of the divisor; lane 0
; (divide by 1) is blended back from the unmodified input at the end.
; NOTE(review) — the assertion lines below are autogenerated; regenerate with
; utils/update_llc_test_checks.py instead of editing them by hand.
define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    psrlq $62, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    paddq %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    psrlq $2, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
; SSE-NEXT:    pxor %xmm2, %xmm1
; SSE-NEXT:    psubq %xmm2, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlq $62, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlq $2, %xmm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    movl $2, %eax
; AVX2-NEXT:    vmovq %rax, %xmm2
; AVX2-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    vpsrlvq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    movl $2, %eax
; AVX512F-NEXT:    vmovq %rax, %xmm1
; AVX512F-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm2
; AVX512F-NEXT:    vpsrlvq {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
; AVX512F-NEXT:    vpsravq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraq $63, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX512BW-NEXT:    movl $2, %eax
; AVX512BW-NEXT:    vmovq %rax, %xmm2
; AVX512BW-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpsravq %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT:    vpshlq {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; XOP-NEXT:    movq $-2, %rax
; XOP-NEXT:    vmovq %rax, %xmm2
; XOP-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; XOP-NEXT:    vpshaq %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; XOP-NEXT:    retq
  %1 = sdiv <2 x i64> %x, <i64 1, i64 4>
  ret <2 x i64> %1
}
   1383 
; Checks lowering of sdiv <4 x i64> by the power-of-2 vector <1, 4, 8, 16>.
; Same shift-and-bias pattern as the v2i64 case, widened to 256 bits: targets
; without a native 64-bit variable arithmetic shift emulate it with a logical
; shift followed by an xor/sub sign-fixup against per-lane sign-bit constants;
; lanes dividing by 1 are blended back from the unmodified input.
; NOTE(review) — the assertion lines below are autogenerated; regenerate with
; utils/update_llc_test_checks.py instead of editing them by hand.
define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $60, %xmm3
; SSE-NEXT:    psrlq $61, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    paddq %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlq $4, %xmm2
; SSE-NEXT:    psrlq $3, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
; SSE-NEXT:    pxor %xmm2, %xmm1
; SSE-NEXT:    psubq %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrad $31, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE-NEXT:    psrlq $62, %xmm2
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    paddq %xmm0, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    psrlq $2, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952]
; SSE-NEXT:    pxor %xmm3, %xmm2
; SSE-NEXT:    psubq %xmm3, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
; AVX1-NEXT:    vpsrlq $60, %xmm3, %xmm4
; AVX1-NEXT:    vpsrlq $61, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq $4, %xmm1, %xmm3
; AVX1-NEXT:    vpsrlq $3, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488]
; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vpsrlq $62, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $2, %xmm2, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952]
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [9223372036854775808,2305843009213693952,1152921504606846976,576460752303423488]
; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,3,4]
; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm2
; AVX512F-NEXT:    vpsrlvq {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT:    vpaddq %ymm2, %ymm0, %ymm2
; AVX512F-NEXT:    vpsravq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraq $63, %ymm0, %ymm1
; AVX512BW-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vpsravq {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553]
; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm2
; XOP-NEXT:    vpshlq {{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
; XOP-NEXT:    movq $-2, %rax
; XOP-NEXT:    vmovq %rax, %xmm3
; XOP-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
; XOP-NEXT:    vpshaq %xmm3, %xmm2, %xmm2
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT:    vpshaq %xmm1, %xmm3, %xmm1
; XOP-NEXT:    vpshlq {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; XOP-NEXT:    retq
  %1 = sdiv <4 x i64> %x, <i64 1, i64 4, i64 8, i64 16>
  ret <4 x i64> %1
}
   1500 
; Checks lowering of sdiv <8 x i64> by the repeated power-of-2 pattern
; <1, 4, 8, 16, 1, 4, 8, 16>. Narrower targets process the vector in 128/256-bit
; halves with the same bias-add-shift sequence as the smaller cases; 512-bit
; targets use a single masked-move to restore the divide-by-1 lanes (mask 17,
; selecting elements 0 and 4).
; NOTE(review) — the assertion lines below are autogenerated; regenerate with
; utils/update_llc_test_checks.py instead of editing them by hand.
define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    movdqa %xmm1, %xmm5
; SSE-NEXT:    psrlq $60, %xmm5
; SSE-NEXT:    psrlq $61, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $4, %xmm3
; SSE-NEXT:    psrlq $3, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488]
; SSE-NEXT:    pxor %xmm5, %xmm1
; SSE-NEXT:    psubq %xmm5, %xmm1
; SSE-NEXT:    movdqa %xmm4, %xmm3
; SSE-NEXT:    psrad $31, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE-NEXT:    movdqa %xmm3, %xmm6
; SSE-NEXT:    psrlq $60, %xmm6
; SSE-NEXT:    psrlq $61, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    paddq %xmm4, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrlq $4, %xmm4
; SSE-NEXT:    psrlq $3, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE-NEXT:    pxor %xmm5, %xmm3
; SSE-NEXT:    psubq %xmm5, %xmm3
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    psrad $31, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE-NEXT:    psrlq $62, %xmm4
; SSE-NEXT:    pxor %xmm5, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; SSE-NEXT:    paddq %xmm0, %xmm4
; SSE-NEXT:    movdqa %xmm4, %xmm6
; SSE-NEXT:    psrlq $2, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [9223372036854775808,2305843009213693952]
; SSE-NEXT:    pxor %xmm4, %xmm6
; SSE-NEXT:    psubq %xmm4, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    movdqa %xmm2, %xmm6
; SSE-NEXT:    psrad $31, %xmm6
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE-NEXT:    psrlq $62, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    paddq %xmm2, %xmm6
; SSE-NEXT:    movdqa %xmm6, %xmm5
; SSE-NEXT:    psrlq $2, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    pxor %xmm4, %xmm5
; SSE-NEXT:    psubq %xmm4, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpsrlq $60, %xmm4, %xmm5
; AVX1-NEXT:    vpsrlq $61, %xmm4, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlq $4, %xmm3, %xmm4
; AVX1-NEXT:    vpsrlq $3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
; AVX1-NEXT:    vpsrlq $62, %xmm5, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm5
; AVX1-NEXT:    vpsrlq $2, %xmm5, %xmm6
; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [9223372036854775808,2305843009213693952]
; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpsubq %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm5
; AVX1-NEXT:    vpsrlq $60, %xmm5, %xmm7
; AVX1-NEXT:    vpsrlq $61, %xmm5, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
; AVX1-NEXT:    vpaddq %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlq $4, %xmm3, %xmm5
; AVX1-NEXT:    vpsrlq $3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm4
; AVX1-NEXT:    vpsrlq $62, %xmm4, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlq $2, %xmm2, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [64,62,61,60]
; AVX2-NEXT:    vpsrlvq %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,2,3,4]
; AVX2-NEXT:    vpsrlvq %ymm5, %ymm3, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [9223372036854775808,2305843009213693952,1152921504606846976,576460752303423488]
; AVX2-NEXT:    vpxor %ymm6, %ymm3, %ymm3
; AVX2-NEXT:    vpsubq %ymm6, %ymm3, %ymm3
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlvq %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrlvq %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm1
; AVX512F-NEXT:    vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vpsravq {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    movb $17, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraq $63, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsravq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    movb $17, %al
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553]
; XOP-NEXT:    vpshaq %xmm3, %xmm2, %xmm4
; XOP-NEXT:    vmovdqa {{.*#+}} xmm8 = [18446744073709551555,18446744073709551556]
; XOP-NEXT:    vpshlq %xmm8, %xmm4, %xmm4
; XOP-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
; XOP-NEXT:    vmovdqa {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612]
; XOP-NEXT:    vpshaq %xmm4, %xmm2, %xmm2
; XOP-NEXT:    vpshaq %xmm3, %xmm0, %xmm6
; XOP-NEXT:    vmovdqa {{.*#+}} xmm7 = [18446744073709551552,18446744073709551554]
; XOP-NEXT:    vpshlq %xmm7, %xmm6, %xmm6
; XOP-NEXT:    vpaddq %xmm6, %xmm0, %xmm6
; XOP-NEXT:    movq $-2, %rax
; XOP-NEXT:    vmovq %rax, %xmm5
; XOP-NEXT:    vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5,6,7]
; XOP-NEXT:    vpshaq %xmm5, %xmm6, %xmm6
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm2
; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT:    vpshaq %xmm3, %xmm2, %xmm6
; XOP-NEXT:    vpshlq %xmm8, %xmm6, %xmm6
; XOP-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
; XOP-NEXT:    vpshaq %xmm4, %xmm2, %xmm2
; XOP-NEXT:    vpshaq %xmm3, %xmm1, %xmm3
; XOP-NEXT:    vpshlq %xmm7, %xmm3, %xmm3
; XOP-NEXT:    vpaddq %xmm3, %xmm1, %xmm3
; XOP-NEXT:    vpshaq %xmm5, %xmm3, %xmm3
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; XOP-NEXT:    retq
  %1 = sdiv <8 x i64> %x, <i64 1, i64 4, i64 8, i64 16, i64 1, i64 4, i64 8, i64 16>
  ret <8 x i64> %1
}
   1692 
; Checks lowering of sdiv <4 x i32> by a mix of positive and negative
; power-of-2 divisors <1, -4, 8, -16>. The division is performed as if by the
; magnitudes (bias-add-shift, lane 0 passed through), then the lanes with
; negative divisors (1 and 3) are negated via a subtract from zero and blended
; back in.
; NOTE(review) — the assertion lines below are autogenerated; regenerate with
; utils/update_llc_test_checks.py instead of editing them by hand.
define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrad $31, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    psrld $28, %xmm1
; SSE-NEXT:    movdqa %xmm2, %xmm3
; SSE-NEXT:    psrld $30, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    psrld $29, %xmm2
; SSE-NEXT:    pxor %xmm4, %xmm4
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE-NEXT:    paddd %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    psrad $4, %xmm2
; SSE-NEXT:    psrad $2, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE-NEXT:    psubd %xmm1, %xmm4
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT:    vpsrld $30, %xmm1, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2ORLATER-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2ORLATER-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrad $31, %xmm0, %xmm1
; XOP-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpshad {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; XOP-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 1, i32 -4, i32 8, i32 -16>
  ret <4 x i32> %1
}
   1770 
; Divisor vector contains undef elements alongside negative powers of 2; the
; whole sdiv is folded away and the function compiles to a bare return on all
; targets.
define <4 x i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 -16>
  ret <4 x i32> %1
}
   1778 
; Same as undef1 but with positive power-of-2 divisors in the defined lanes;
; the sdiv is likewise folded away, leaving only a return.
define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 undef, i32 4, i32 undef, i32 16>
  ret <4 x i32> %1
}
   1786 
; Mixed-sign power-of-2 divisors with undef lanes; as with undef1/undef2 the
; sdiv folds away entirely and only a return is emitted.
define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 16>
  ret <4 x i32> %1
}
   1794 
   1795 ; PR37119
define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) {
; Every divisor lane is +1 or -1, so no real division is emitted: each
; target computes 0 - A (pxor + psubb / vpsubb) and blends it with A using
; the constant byte mask [0,0,255,...] — 0xff lanes keep A (divisor +1),
; zero lanes take the negation (divisor -1). AVX512BW performs the same
; select with a k-register mask (0x1BB) and a masked vpsubb instead of a
; vector blend.
; SSE-LABEL: non_splat_minus_one_divisor_0:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    psubb %xmm0, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: non_splat_minus_one_divisor_0:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: non_splat_minus_one_divisor_0:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: non_splat_minus_one_divisor_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: non_splat_minus_one_divisor_0:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    movw $443, %ax # imm = 0x1BB
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vpsubb %xmm0, %xmm1, %xmm0 {%k1}
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: non_splat_minus_one_divisor_0:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; XOP-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
  %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %div
}
   1849 
define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; Divisor lanes mix -1, 2 and -128. The generated code performs a signed
; divide by power-of-two: compute a sign-dependent bias from the sign mask
; (pcmpgtb against zero, then per-lane shifts done via widened 16-bit
; multiplies/shifts on SSE/AVX, vpsrlvd/vpsrlvw on AVX512, vpshlb on XOP),
; add it to A, and arithmetic-shift the sum per lane. The first blend mask
; [0,0,255,...] keeps original A in the |divisor| == 1 lanes and takes the
; shifted result elsewhere; the final 0 - x plus second blend mask
; [255,255,0,...] negates exactly the lanes whose divisor is negative
; (-1 or -128). AVX512BW uses k-register masked moves (0x1BB, 0x5E44) for
; the two selects.
; SSE-LABEL: non_splat_minus_one_divisor_1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    pcmpgtb %xmm1, %xmm0
; SSE-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm4
; SSE-NEXT:    psrlw $8, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm0, %xmm4
; SSE-NEXT:    paddb %xmm1, %xmm4
; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
; SSE-NEXT:    movdqa %xmm3, %xmm5
; SSE-NEXT:    psraw $4, %xmm5
; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [0,32,0,32,8192,8224,57376,57376]
; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
; SSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm5
; SSE-NEXT:    psraw $2, %xmm5
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm5
; SSE-NEXT:    psraw $1, %xmm5
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
; SSE-NEXT:    psrlw $8, %xmm3
; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    movdqa %xmm4, %xmm5
; SSE-NEXT:    psraw $4, %xmm5
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
; SSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm4
; SSE-NEXT:    movdqa %xmm4, %xmm5
; SSE-NEXT:    psraw $2, %xmm5
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm4
; SSE-NEXT:    movdqa %xmm4, %xmm5
; SSE-NEXT:    psraw $1, %xmm5
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm4
; SSE-NEXT:    psrlw $8, %xmm4
; SSE-NEXT:    packuswb %xmm3, %xmm4
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; SSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
; SSE-NEXT:    psubb %xmm1, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
; SSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: non_splat_minus_one_divisor_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT:    vpsraw $4, %xmm3, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,32,0,32,8192,8224,57376,57376]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
; AVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsraw $2, %xmm3, %xmm4
; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm4
; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT:    vpsraw $4, %xmm2, %xmm4
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsraw $2, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: non_splat_minus_one_divisor_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX2-NEXT:    vpsraw $4, %xmm3, %xmm4
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,32,0,32,8192,8224,57376,57376]
; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
; AVX2-NEXT:    vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpsraw $2, %xmm3, %xmm4
; AVX2-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
; AVX2-NEXT:    vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpsraw $1, %xmm3, %xmm4
; AVX2-NEXT:    vpaddw %xmm6, %xmm6, %xmm6
; AVX2-NEXT:    vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX2-NEXT:    vpsraw $4, %xmm2, %xmm4
; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; AVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpsraw $2, %xmm2, %xmm4
; AVX2-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpsraw $1, %xmm2, %xmm4
; AVX2-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: non_splat_minus_one_divisor_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512F-NEXT:    vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
; AVX512F-NEXT:    vpmovsxbd %xmm2, %zmm2
; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %zmm2, %zmm2
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: non_splat_minus_one_divisor_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovwb %ymm2, %xmm2
; AVX512BW-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
; AVX512BW-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovwb %ymm2, %xmm2
; AVX512BW-NEXT:    movw $443, %ax # imm = 0x1BB
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %xmm0, %xmm2 {%k1}
; AVX512BW-NEXT:    vpsubb %xmm2, %xmm1, %xmm0
; AVX512BW-NEXT:    movw $24132, %ax # imm = 0x5E44
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %xmm2, %xmm0 {%k1}
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: non_splat_minus_one_divisor_1:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; XOP-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
; XOP-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 2, i8 2, i8 2, i8 2, i8 -128, i8 2, i8 -128>
  ret <16 x i8> %div
}
   2046 
define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) {
; Divisors <-1, 1, 2, -2>: lanes 2 and 3 need a real signed divide by 2
; (add the sign bit as bias — psrld $31 with the bias zeroed out of the
; low lanes — then psrad $1), while lanes 0 and 1 keep the original value
; via the first blend. The final psubd/vpsubd plus second blend negates
; exactly lanes 0 and 3, whose divisors are negative.
; SSE-LABEL: non_splat_minus_one_divisor_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $31, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    psrad $1, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    psubd %xmm1, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: non_splat_minus_one_divisor_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: non_splat_minus_one_divisor_2:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2ORLATER-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2ORLATER-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: non_splat_minus_one_divisor_2:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrad $31, %xmm0, %xmm1
; XOP-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpshad {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; XOP-NEXT:    retq
  %div = sdiv <4 x i32> %A, <i32 -1, i32 1, i32 2, i32 -2>
  ret <4 x i32> %div
}
   2100