; (code-viewer navigation header removed; this file is an X86 CodeGen lit test)
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX2NOBW
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW
      5 
      6 ;
      7 ; udiv by 7
      8 ;
      9 
     10 define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
     11 ; AVX1-LABEL: test_div7_4i64:
     12 ; AVX1:       # %bb.0:
     13 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
     14 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
     15 ; AVX1-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
     16 ; AVX1-NEXT:    movq %rcx, %rax
     17 ; AVX1-NEXT:    mulq %rsi
     18 ; AVX1-NEXT:    subq %rdx, %rcx
     19 ; AVX1-NEXT:    shrq %rcx
     20 ; AVX1-NEXT:    addq %rdx, %rcx
     21 ; AVX1-NEXT:    shrq $2, %rcx
     22 ; AVX1-NEXT:    vmovq %rcx, %xmm2
     23 ; AVX1-NEXT:    vmovq %xmm1, %rcx
     24 ; AVX1-NEXT:    movq %rcx, %rax
     25 ; AVX1-NEXT:    mulq %rsi
     26 ; AVX1-NEXT:    subq %rdx, %rcx
     27 ; AVX1-NEXT:    shrq %rcx
     28 ; AVX1-NEXT:    addq %rdx, %rcx
     29 ; AVX1-NEXT:    shrq $2, %rcx
     30 ; AVX1-NEXT:    vmovq %rcx, %xmm1
     31 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
     32 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
     33 ; AVX1-NEXT:    movq %rcx, %rax
     34 ; AVX1-NEXT:    mulq %rsi
     35 ; AVX1-NEXT:    subq %rdx, %rcx
     36 ; AVX1-NEXT:    shrq %rcx
     37 ; AVX1-NEXT:    addq %rdx, %rcx
     38 ; AVX1-NEXT:    shrq $2, %rcx
     39 ; AVX1-NEXT:    vmovq %rcx, %xmm2
     40 ; AVX1-NEXT:    vmovq %xmm0, %rcx
     41 ; AVX1-NEXT:    movq %rcx, %rax
     42 ; AVX1-NEXT:    mulq %rsi
     43 ; AVX1-NEXT:    subq %rdx, %rcx
     44 ; AVX1-NEXT:    shrq %rcx
     45 ; AVX1-NEXT:    addq %rdx, %rcx
     46 ; AVX1-NEXT:    shrq $2, %rcx
     47 ; AVX1-NEXT:    vmovq %rcx, %xmm0
     48 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
     49 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
     50 ; AVX1-NEXT:    retq
     51 ;
     52 ; AVX2-LABEL: test_div7_4i64:
     53 ; AVX2:       # %bb.0:
     54 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
     55 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
     56 ; AVX2-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
     57 ; AVX2-NEXT:    movq %rcx, %rax
     58 ; AVX2-NEXT:    mulq %rsi
     59 ; AVX2-NEXT:    subq %rdx, %rcx
     60 ; AVX2-NEXT:    shrq %rcx
     61 ; AVX2-NEXT:    addq %rdx, %rcx
     62 ; AVX2-NEXT:    shrq $2, %rcx
     63 ; AVX2-NEXT:    vmovq %rcx, %xmm2
     64 ; AVX2-NEXT:    vmovq %xmm1, %rcx
     65 ; AVX2-NEXT:    movq %rcx, %rax
     66 ; AVX2-NEXT:    mulq %rsi
     67 ; AVX2-NEXT:    subq %rdx, %rcx
     68 ; AVX2-NEXT:    shrq %rcx
     69 ; AVX2-NEXT:    addq %rdx, %rcx
     70 ; AVX2-NEXT:    shrq $2, %rcx
     71 ; AVX2-NEXT:    vmovq %rcx, %xmm1
     72 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
     73 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
     74 ; AVX2-NEXT:    movq %rcx, %rax
     75 ; AVX2-NEXT:    mulq %rsi
     76 ; AVX2-NEXT:    subq %rdx, %rcx
     77 ; AVX2-NEXT:    shrq %rcx
     78 ; AVX2-NEXT:    addq %rdx, %rcx
     79 ; AVX2-NEXT:    shrq $2, %rcx
     80 ; AVX2-NEXT:    vmovq %rcx, %xmm2
     81 ; AVX2-NEXT:    vmovq %xmm0, %rcx
     82 ; AVX2-NEXT:    movq %rcx, %rax
     83 ; AVX2-NEXT:    mulq %rsi
     84 ; AVX2-NEXT:    subq %rdx, %rcx
     85 ; AVX2-NEXT:    shrq %rcx
     86 ; AVX2-NEXT:    addq %rdx, %rcx
     87 ; AVX2-NEXT:    shrq $2, %rcx
     88 ; AVX2-NEXT:    vmovq %rcx, %xmm0
     89 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
     90 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
     91 ; AVX2-NEXT:    retq
     92   %res = udiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
     93   ret <4 x i64> %res
     94 }
     95 
; test_div7_8i32: unsigned divide of each i32 lane by 7 stays in vector
; registers. The 32-bit high-multiply by the magic 613566757 (0x24924925)
; is synthesized from vpmuludq on the even lanes and on the shuffled odd
; lanes, with the two high halves re-interleaved via a blend; then the
; standard sub/srl $1/add/srl $2 fixup produces the quotient. AVX1 must
; split the 256-bit vector into two 128-bit halves; AVX2 does it in one
; 256-bit sequence (vpblendd instead of vpblendw).
; CHECK lines are autogenerated by update_llc_test_checks.py.
     96 define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
     97 ; AVX1-LABEL: test_div7_8i32:
     98 ; AVX1:       # %bb.0:
     99 ; AVX1-NEXT:    vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
    100 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
    101 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
    102 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
    103 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
    104 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
    105 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
    106 ; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm3
    107 ; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
    108 ; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
    109 ; AVX1-NEXT:    vpsrld $2, %xmm2, %xmm2
    110 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
    111 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
    112 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
    113 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
    114 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
    115 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
    116 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
    117 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
    118 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
    119 ; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
    120 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
    121 ; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
    122 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
    123 ; AVX1-NEXT:    retq
    124 ;
    125 ; AVX2-LABEL: test_div7_8i32:
    126 ; AVX2:       # %bb.0:
    127 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
    128 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
    129 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
    130 ; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
    131 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
    132 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
    133 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
    134 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
    135 ; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
    136 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    137 ; AVX2-NEXT:    vpsrld $2, %ymm0, %ymm0
    138 ; AVX2-NEXT:    retq
    139   %res = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
    140   ret <8 x i32> %res
    141 }
    142 
; test_div7_16i16: the i16 case is the cheapest — vpmulhuw gives the
; unsigned high-half multiply by the magic 9363 (0x2493) directly, so only
; the sub/srlw $1/add/srlw $2 fixup follows. AVX1 runs the sequence twice
; on 128-bit halves; AVX2 once on the full ymm with the splat constant
; folded into a memory operand.
; CHECK lines are autogenerated by update_llc_test_checks.py.
    143 define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
    144 ; AVX1-LABEL: test_div7_16i16:
    145 ; AVX1:       # %bb.0:
    146 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
    147 ; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
    148 ; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm3
    149 ; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
    150 ; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
    151 ; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
    152 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
    153 ; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm1
    154 ; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
    155 ; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
    156 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
    157 ; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
    158 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
    159 ; AVX1-NEXT:    retq
    160 ;
    161 ; AVX2-LABEL: test_div7_16i16:
    162 ; AVX2:       # %bb.0:
    163 ; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
    164 ; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
    165 ; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
    166 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    167 ; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm0
    168 ; AVX2-NEXT:    retq
    169   %res = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
    170   ret <16 x i16> %res
    171 }
    172 
; test_div7_32i8: no x86 byte multiply exists, so the i8 magic multiply by
; 37 is done by zero-extending bytes to words (vpmovzxbw), vpmullw, taking
; the high byte with a logical right shift by 8, and re-packing with
; vpackuswb. Byte shifts also don't exist: srl $1 / srl $2 are emulated as
; word shifts followed by AND masks 127 / 63 to clear bits shifted in from
; the neighboring byte. AVX512BW instead widens the whole vector to zmm
; words and truncates back with vpmovwb, avoiding the pack/mask dance for
; the multiply.
; CHECK lines are autogenerated by update_llc_test_checks.py.
    173 define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
    174 ; AVX1-LABEL: test_div7_32i8:
    175 ; AVX1:       # %bb.0:
    176 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    177 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
    178 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
    179 ; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
    180 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
    181 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
    182 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
    183 ; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm4
    184 ; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
    185 ; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
    186 ; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
    187 ; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
    188 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
    189 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
    190 ; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    191 ; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
    192 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
    193 ; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
    194 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    195 ; AVX1-NEXT:    vpmullw %xmm3, %xmm5, %xmm5
    196 ; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
    197 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
    198 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
    199 ; AVX1-NEXT:    vpmullw %xmm3, %xmm6, %xmm3
    200 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
    201 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm5, %xmm3
    202 ; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
    203 ; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
    204 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
    205 ; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    206 ; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
    207 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
    208 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    209 ; AVX1-NEXT:    retq
    210 ;
    211 ; AVX2NOBW-LABEL: test_div7_32i8:
    212 ; AVX2NOBW:       # %bb.0:
    213 ; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    214 ; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
    215 ; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
    216 ; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
    217 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
    218 ; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
    219 ; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
    220 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
    221 ; AVX2NOBW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
    222 ; AVX2NOBW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
    223 ; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
    224 ; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
    225 ; AVX2NOBW-NEXT:    vpsrlw $1, %ymm0, %ymm0
    226 ; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
    227 ; AVX2NOBW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
    228 ; AVX2NOBW-NEXT:    vpsrlw $2, %ymm0, %ymm0
    229 ; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
    230 ; AVX2NOBW-NEXT:    retq
    231 ;
    232 ; AVX512BW-LABEL: test_div7_32i8:
    233 ; AVX512BW:       # %bb.0:
    234 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
    235 ; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
    236 ; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
    237 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
    238 ; AVX512BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
    239 ; AVX512BW-NEXT:    vpsrlw $1, %ymm0, %ymm0
    240 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
    241 ; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
    242 ; AVX512BW-NEXT:    vpsrlw $2, %ymm0, %ymm0
    243 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
    244 ; AVX512BW-NEXT:    retq
    245   %res = udiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
    246   ret <32 x i8> %res
    247 }
    248 
    249 ;
    250 ; urem by 7
    251 ;
    252 
; test_rem7_4i64: unsigned remainder of each i64 lane by 7. Same
; scalarized magic-number division as test_div7_4i64, then the remainder
; is reconstructed without a multiply:
;   leaq (,%rax,8), %rdx   ; rdx = 8*q
;   subq %rdx, %rax        ; rax = q - 8*q = -7*q
;   addq %rcx, %rax        ; rax = n - 7*q  (the remainder)
; AVX1 vs AVX2 again differs only in the 128-bit extract/insert opcodes.
; CHECK lines are autogenerated by update_llc_test_checks.py.
    253 define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
    254 ; AVX1-LABEL: test_rem7_4i64:
    255 ; AVX1:       # %bb.0:
    256 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    257 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
    258 ; AVX1-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
    259 ; AVX1-NEXT:    movq %rcx, %rax
    260 ; AVX1-NEXT:    mulq %rsi
    261 ; AVX1-NEXT:    movq %rcx, %rax
    262 ; AVX1-NEXT:    subq %rdx, %rax
    263 ; AVX1-NEXT:    shrq %rax
    264 ; AVX1-NEXT:    addq %rdx, %rax
    265 ; AVX1-NEXT:    shrq $2, %rax
    266 ; AVX1-NEXT:    leaq (,%rax,8), %rdx
    267 ; AVX1-NEXT:    subq %rdx, %rax
    268 ; AVX1-NEXT:    addq %rcx, %rax
    269 ; AVX1-NEXT:    vmovq %rax, %xmm2
    270 ; AVX1-NEXT:    vmovq %xmm1, %rcx
    271 ; AVX1-NEXT:    movq %rcx, %rax
    272 ; AVX1-NEXT:    mulq %rsi
    273 ; AVX1-NEXT:    movq %rcx, %rax
    274 ; AVX1-NEXT:    subq %rdx, %rax
    275 ; AVX1-NEXT:    shrq %rax
    276 ; AVX1-NEXT:    addq %rdx, %rax
    277 ; AVX1-NEXT:    shrq $2, %rax
    278 ; AVX1-NEXT:    leaq (,%rax,8), %rdx
    279 ; AVX1-NEXT:    subq %rdx, %rax
    280 ; AVX1-NEXT:    addq %rcx, %rax
    281 ; AVX1-NEXT:    vmovq %rax, %xmm1
    282 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
    283 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
    284 ; AVX1-NEXT:    movq %rcx, %rax
    285 ; AVX1-NEXT:    mulq %rsi
    286 ; AVX1-NEXT:    movq %rcx, %rax
    287 ; AVX1-NEXT:    subq %rdx, %rax
    288 ; AVX1-NEXT:    shrq %rax
    289 ; AVX1-NEXT:    addq %rdx, %rax
    290 ; AVX1-NEXT:    shrq $2, %rax
    291 ; AVX1-NEXT:    leaq (,%rax,8), %rdx
    292 ; AVX1-NEXT:    subq %rdx, %rax
    293 ; AVX1-NEXT:    addq %rcx, %rax
    294 ; AVX1-NEXT:    vmovq %rax, %xmm2
    295 ; AVX1-NEXT:    vmovq %xmm0, %rcx
    296 ; AVX1-NEXT:    movq %rcx, %rax
    297 ; AVX1-NEXT:    mulq %rsi
    298 ; AVX1-NEXT:    movq %rcx, %rax
    299 ; AVX1-NEXT:    subq %rdx, %rax
    300 ; AVX1-NEXT:    shrq %rax
    301 ; AVX1-NEXT:    addq %rdx, %rax
    302 ; AVX1-NEXT:    shrq $2, %rax
    303 ; AVX1-NEXT:    leaq (,%rax,8), %rdx
    304 ; AVX1-NEXT:    subq %rdx, %rax
    305 ; AVX1-NEXT:    addq %rcx, %rax
    306 ; AVX1-NEXT:    vmovq %rax, %xmm0
    307 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
    308 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    309 ; AVX1-NEXT:    retq
    310 ;
    311 ; AVX2-LABEL: test_rem7_4i64:
    312 ; AVX2:       # %bb.0:
    313 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    314 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
    315 ; AVX2-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
    316 ; AVX2-NEXT:    movq %rcx, %rax
    317 ; AVX2-NEXT:    mulq %rsi
    318 ; AVX2-NEXT:    movq %rcx, %rax
    319 ; AVX2-NEXT:    subq %rdx, %rax
    320 ; AVX2-NEXT:    shrq %rax
    321 ; AVX2-NEXT:    addq %rdx, %rax
    322 ; AVX2-NEXT:    shrq $2, %rax
    323 ; AVX2-NEXT:    leaq (,%rax,8), %rdx
    324 ; AVX2-NEXT:    subq %rdx, %rax
    325 ; AVX2-NEXT:    addq %rcx, %rax
    326 ; AVX2-NEXT:    vmovq %rax, %xmm2
    327 ; AVX2-NEXT:    vmovq %xmm1, %rcx
    328 ; AVX2-NEXT:    movq %rcx, %rax
    329 ; AVX2-NEXT:    mulq %rsi
    330 ; AVX2-NEXT:    movq %rcx, %rax
    331 ; AVX2-NEXT:    subq %rdx, %rax
    332 ; AVX2-NEXT:    shrq %rax
    333 ; AVX2-NEXT:    addq %rdx, %rax
    334 ; AVX2-NEXT:    shrq $2, %rax
    335 ; AVX2-NEXT:    leaq (,%rax,8), %rdx
    336 ; AVX2-NEXT:    subq %rdx, %rax
    337 ; AVX2-NEXT:    addq %rcx, %rax
    338 ; AVX2-NEXT:    vmovq %rax, %xmm1
    339 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
    340 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
    341 ; AVX2-NEXT:    movq %rcx, %rax
    342 ; AVX2-NEXT:    mulq %rsi
    343 ; AVX2-NEXT:    movq %rcx, %rax
    344 ; AVX2-NEXT:    subq %rdx, %rax
    345 ; AVX2-NEXT:    shrq %rax
    346 ; AVX2-NEXT:    addq %rdx, %rax
    347 ; AVX2-NEXT:    shrq $2, %rax
    348 ; AVX2-NEXT:    leaq (,%rax,8), %rdx
    349 ; AVX2-NEXT:    subq %rdx, %rax
    350 ; AVX2-NEXT:    addq %rcx, %rax
    351 ; AVX2-NEXT:    vmovq %rax, %xmm2
    352 ; AVX2-NEXT:    vmovq %xmm0, %rcx
    353 ; AVX2-NEXT:    movq %rcx, %rax
    354 ; AVX2-NEXT:    mulq %rsi
    355 ; AVX2-NEXT:    movq %rcx, %rax
    356 ; AVX2-NEXT:    subq %rdx, %rax
    357 ; AVX2-NEXT:    shrq %rax
    358 ; AVX2-NEXT:    addq %rdx, %rax
    359 ; AVX2-NEXT:    shrq $2, %rax
    360 ; AVX2-NEXT:    leaq (,%rax,8), %rdx
    361 ; AVX2-NEXT:    subq %rdx, %rax
    362 ; AVX2-NEXT:    addq %rcx, %rax
    363 ; AVX2-NEXT:    vmovq %rax, %xmm0
    364 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
    365 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
    366 ; AVX2-NEXT:    retq
    367   %res = urem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
    368   ret <4 x i64> %res
    369 }
    370 
; test_rem7_8i32: unsigned remainder of each i32 lane by 7. Computes the
; quotient exactly as test_div7_8i32 (pmuludq-based high multiply with the
; sub/srl/add/srl fixup), then remainder = n - q*7 via vpmulld by a splat
; 7 and a final vpsubd. AVX1 processes the two 128-bit halves separately;
; AVX2 stays 256-bit throughout.
; CHECK lines are autogenerated by update_llc_test_checks.py.
    371 define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
    372 ; AVX1-LABEL: test_rem7_8i32:
    373 ; AVX1:       # %bb.0:
    374 ; AVX1-NEXT:    vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
    375 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
    376 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
    377 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
    378 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
    379 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm5, %xmm3
    380 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
    381 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
    382 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
    383 ; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm3
    384 ; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
    385 ; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
    386 ; AVX1-NEXT:    vpsrld $2, %xmm2, %xmm2
    387 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
    388 ; AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
    389 ; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
    390 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
    391 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
    392 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm5, %xmm4
    393 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
    394 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
    395 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
    396 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm4
    397 ; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
    398 ; AVX1-NEXT:    vpaddd %xmm1, %xmm4, %xmm1
    399 ; AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
    400 ; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
    401 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
    402 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    403 ; AVX1-NEXT:    retq
    404 ;
    405 ; AVX2-LABEL: test_rem7_8i32:
    406 ; AVX2:       # %bb.0:
    407 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
    408 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
    409 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
    410 ; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
    411 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
    412 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
    413 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
    414 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm2
    415 ; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2
    416 ; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
    417 ; AVX2-NEXT:    vpsrld $2, %ymm1, %ymm1
    418 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
    419 ; AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
    420 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
    421 ; AVX2-NEXT:    retq
    422   %res = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
    423   ret <8 x i32> %res
    424 }
    425 
; test_rem7_16i16: unsigned remainder of each i16 lane by 7. Quotient via
; vpmulhuw by the magic 9363 plus the sub/srlw/add/srlw fixup (as in
; test_div7_16i16), then remainder = n - q*7 via vpmullw by splat 7 and
; vpsubw. AVX1 handles the two 128-bit halves separately; AVX2 works on
; the full ymm with the constants folded into memory operands.
; CHECK lines are autogenerated by update_llc_test_checks.py.
    426 define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
    427 ; AVX1-LABEL: test_rem7_16i16:
    428 ; AVX1:       # %bb.0:
    429 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    430 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
    431 ; AVX1-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm3
    432 ; AVX1-NEXT:    vpsubw %xmm3, %xmm1, %xmm4
    433 ; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm4
    434 ; AVX1-NEXT:    vpaddw %xmm3, %xmm4, %xmm3
    435 ; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm3
    436 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7]
    437 ; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
    438 ; AVX1-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
    439 ; AVX1-NEXT:    vpmulhuw %xmm2, %xmm0, %xmm2
    440 ; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm3
    441 ; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
    442 ; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
    443 ; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
    444 ; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
    445 ; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
    446 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    447 ; AVX1-NEXT:    retq
    448 ;
    449 ; AVX2-LABEL: test_rem7_16i16:
    450 ; AVX2:       # %bb.0:
    451 ; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
    452 ; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm2
    453 ; AVX2-NEXT:    vpsrlw $1, %ymm2, %ymm2
    454 ; AVX2-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
    455 ; AVX2-NEXT:    vpsrlw $2, %ymm1, %ymm1
    456 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
    457 ; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
    458 ; AVX2-NEXT:    retq
    459   %res = urem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
    460   ret <16 x i16> %res
    461 }
    462 
; test_rem7_32i8: unsigned remainder of each i8 lane by 7. Quotient via
; the widened i16 multiply by 37 (as in test_div7_32i8, including the
; 127/63 AND masks emulating byte shifts), then q*7 is computed with
; another widen-multiply-repack sequence before the final vpsubb gives
; n - q*7. AVX2NOBW cross-multiplies in ymm words and repacks via
; vpshufb/vpunpcklqdq; AVX512BW widens to zmm words and truncates with
; vpmovwb, so no repacking shuffles are needed.
; CHECK lines are autogenerated by update_llc_test_checks.py.
    463 define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
    464 ; AVX1-LABEL: test_rem7_32i8:
    465 ; AVX1:       # %bb.0:
    466 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    467 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
    468 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
    469 ; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
    470 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
    471 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
    472 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
    473 ; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm4
    474 ; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
    475 ; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
    476 ; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm4
    477 ; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm4
    478 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
    479 ; AVX1-NEXT:    vpand %xmm8, %xmm4, %xmm4
    480 ; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
    481 ; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
    482 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
    483 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
    484 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
    485 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
    486 ; AVX1-NEXT:    vpmullw %xmm7, %xmm6, %xmm6
    487 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
    488 ; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
    489 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
    490 ; AVX1-NEXT:    vpmullw %xmm7, %xmm2, %xmm2
    491 ; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
    492 ; AVX1-NEXT:    vpackuswb %xmm6, %xmm2, %xmm2
    493 ; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
    494 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    495 ; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
    496 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
    497 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
    498 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
    499 ; AVX1-NEXT:    vpmullw %xmm3, %xmm6, %xmm3
    500 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
    501 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
    502 ; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm3
    503 ; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
    504 ; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
    505 ; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
    506 ; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
    507 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
    508 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
    509 ; AVX1-NEXT:    vpmullw %xmm7, %xmm3, %xmm3
    510 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
    511 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
    512 ; AVX1-NEXT:    vpmullw %xmm7, %xmm2, %xmm2
    513 ; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
    514 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
    515 ; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
    516 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    517 ; AVX1-NEXT:    retq
    518 ;
    519 ; AVX2NOBW-LABEL: test_rem7_32i8:
    520 ; AVX2NOBW:       # %bb.0:
    521 ; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    522 ; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
    523 ; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
    524 ; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
    525 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
    526 ; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
    527 ; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
    528 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
    529 ; AVX2NOBW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
    530 ; AVX2NOBW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
    531 ; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
    532 ; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm2
    533 ; AVX2NOBW-NEXT:    vpsrlw $1, %ymm2, %ymm2
    534 ; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
    535 ; AVX2NOBW-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
    536 ; AVX2NOBW-NEXT:    vpsrlw $2, %ymm1, %ymm1
    537 ; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
    538 ; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
    539 ; AVX2NOBW-NEXT:    vpmovsxbw %xmm2, %ymm2
    540 ; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
    541 ; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
    542 ; AVX2NOBW-NEXT:    vextracti128 $1, %ymm2, %xmm4
    543 ; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
    544 ; AVX2NOBW-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
    545 ; AVX2NOBW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
    546 ; AVX2NOBW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
    547 ; AVX2NOBW-NEXT:    vpmovsxbw %xmm1, %ymm1
    548 ; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
    549 ; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm3
    550 ; AVX2NOBW-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
    551 ; AVX2NOBW-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
    552 ; AVX2NOBW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
    553 ; AVX2NOBW-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
    554 ; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
    555 ; AVX2NOBW-NEXT:    retq
    556 ;
    557 ; AVX512BW-LABEL: test_rem7_32i8:
    558 ; AVX512BW:       # %bb.0:
    559 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
    560 ; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
    561 ; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
    562 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
    563 ; AVX512BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm2
    564 ; AVX512BW-NEXT:    vpsrlw $1, %ymm2, %ymm2
    565 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
    566 ; AVX512BW-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
    567 ; AVX512BW-NEXT:    vpsrlw $2, %ymm1, %ymm1
    568 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
    569 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
    570 ; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
    571 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
    572 ; AVX512BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
    573 ; AVX512BW-NEXT:    retq
    574   %res = urem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
    575   ret <32 x i8> %res
    576 }
    577