; NOTE: code-browser navigation header removed; this is an LLVM CodeGen/X86 regression test.
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
      4 
      5 ;
      6 ; sdiv by 7
      7 ;
      8 
      9 define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
        ; sdiv <8 x i64> by 7.  AVX-512 has no 64-bit vector multiply-high, so
        ; llc scalarizes all 8 lanes: each i64 is extracted, multiplied high via
        ; imulq with the magic constant 0x4924924924924925, rounded toward zero
        ; (shrq $63 captures the sign bit that addq folds back in after sarq),
        ; then the lanes are re-packed with vpunpcklqdq/vinserti128/vinserti64x4.
        ; The CHECK lines below are autogenerated -- do not hand-edit.
     10 ; AVX-LABEL: test_div7_8i64:
     11 ; AVX:       # %bb.0:
     12 ; AVX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
     13 ; AVX-NEXT:    vpextrq $1, %xmm1, %rax
     14 ; AVX-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
     15 ; AVX-NEXT:    imulq %rcx
     16 ; AVX-NEXT:    movq %rdx, %rax
     17 ; AVX-NEXT:    shrq $63, %rax
     18 ; AVX-NEXT:    sarq %rdx
     19 ; AVX-NEXT:    addq %rax, %rdx
     20 ; AVX-NEXT:    vmovq %rdx, %xmm2
     21 ; AVX-NEXT:    vmovq %xmm1, %rax
     22 ; AVX-NEXT:    imulq %rcx
     23 ; AVX-NEXT:    movq %rdx, %rax
     24 ; AVX-NEXT:    shrq $63, %rax
     25 ; AVX-NEXT:    sarq %rdx
     26 ; AVX-NEXT:    addq %rax, %rdx
     27 ; AVX-NEXT:    vmovq %rdx, %xmm1
     28 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
     29 ; AVX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
     30 ; AVX-NEXT:    vpextrq $1, %xmm2, %rax
     31 ; AVX-NEXT:    imulq %rcx
     32 ; AVX-NEXT:    movq %rdx, %rax
     33 ; AVX-NEXT:    shrq $63, %rax
     34 ; AVX-NEXT:    sarq %rdx
     35 ; AVX-NEXT:    addq %rax, %rdx
     36 ; AVX-NEXT:    vmovq %rdx, %xmm3
     37 ; AVX-NEXT:    vmovq %xmm2, %rax
     38 ; AVX-NEXT:    imulq %rcx
     39 ; AVX-NEXT:    movq %rdx, %rax
     40 ; AVX-NEXT:    shrq $63, %rax
     41 ; AVX-NEXT:    sarq %rdx
     42 ; AVX-NEXT:    addq %rax, %rdx
     43 ; AVX-NEXT:    vmovq %rdx, %xmm2
     44 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
     45 ; AVX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
     46 ; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm2
     47 ; AVX-NEXT:    vpextrq $1, %xmm2, %rax
     48 ; AVX-NEXT:    imulq %rcx
     49 ; AVX-NEXT:    movq %rdx, %rax
     50 ; AVX-NEXT:    shrq $63, %rax
     51 ; AVX-NEXT:    sarq %rdx
     52 ; AVX-NEXT:    addq %rax, %rdx
     53 ; AVX-NEXT:    vmovq %rdx, %xmm3
     54 ; AVX-NEXT:    vmovq %xmm2, %rax
     55 ; AVX-NEXT:    imulq %rcx
     56 ; AVX-NEXT:    movq %rdx, %rax
     57 ; AVX-NEXT:    shrq $63, %rax
     58 ; AVX-NEXT:    sarq %rdx
     59 ; AVX-NEXT:    addq %rax, %rdx
     60 ; AVX-NEXT:    vmovq %rdx, %xmm2
     61 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
     62 ; AVX-NEXT:    vpextrq $1, %xmm0, %rax
     63 ; AVX-NEXT:    imulq %rcx
     64 ; AVX-NEXT:    movq %rdx, %rax
     65 ; AVX-NEXT:    shrq $63, %rax
     66 ; AVX-NEXT:    sarq %rdx
     67 ; AVX-NEXT:    addq %rax, %rdx
     68 ; AVX-NEXT:    vmovq %rdx, %xmm3
     69 ; AVX-NEXT:    vmovq %xmm0, %rax
     70 ; AVX-NEXT:    imulq %rcx
     71 ; AVX-NEXT:    movq %rdx, %rax
     72 ; AVX-NEXT:    shrq $63, %rax
     73 ; AVX-NEXT:    sarq %rdx
     74 ; AVX-NEXT:    addq %rax, %rdx
     75 ; AVX-NEXT:    vmovq %rdx, %xmm0
     76 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
     77 ; AVX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
     78 ; AVX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
     79 ; AVX-NEXT:    retq
     80   %res = sdiv <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
     81   ret <8 x i64> %res
     82 }
     83 
     84 define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
        ; sdiv <16 x i32> by 7 stays fully vectorized: vpmuldq produces 64-bit
        ; products of the even lanes, vpshufd moves the odd lanes down for a
        ; second vpmuldq, and vpermi2d gathers the odd (high-half) dwords of
        ; both products.  vpaddd adds the dividend back (the magic constant
        ; 2454267027 is negative as i32), then vpsrld $31 + vpsrad $2 + vpaddd
        ; complete the signed round-toward-zero fixup.  Autogenerated CHECKs.
     85 ; AVX-LABEL: test_div7_16i32:
     86 ; AVX:       # %bb.0:
     87 ; AVX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
     88 ; AVX-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2
     89 ; AVX-NEXT:    vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
     90 ; AVX-NEXT:    vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
     91 ; AVX-NEXT:    vpmuldq %zmm1, %zmm3, %zmm1
     92 ; AVX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
     93 ; AVX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
     94 ; AVX-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
     95 ; AVX-NEXT:    vpsrld $31, %zmm0, %zmm1
     96 ; AVX-NEXT:    vpsrad $2, %zmm0, %zmm0
     97 ; AVX-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
     98 ; AVX-NEXT:    retq
     99   %res = sdiv <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
    100   ret <16 x i32> %res
    101 }
    102 
    103 define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
        ; sdiv <32 x i16> by 7.  AVX512F has no 512-bit vpmulhw (that needs
        ; BWI), so the vector is split into two ymm halves, each multiplied
        ; high by 18725 (0x4925); AVX512BW does a single zmm vpmulhw from a
        ; memory constant.  vpsrlw $15 extracts the sign bit that vpaddw adds
        ; back for round-toward-zero after the vpsraw $1.  Autogenerated CHECKs.
    104 ; AVX512F-LABEL: test_div7_32i16:
    105 ; AVX512F:       # %bb.0:
    106 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
    107 ; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
    108 ; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm3
    109 ; AVX512F-NEXT:    vpsraw $1, %ymm0, %ymm0
    110 ; AVX512F-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
    111 ; AVX512F-NEXT:    vpmulhw %ymm2, %ymm1, %ymm1
    112 ; AVX512F-NEXT:    vpsrlw $15, %ymm1, %ymm2
    113 ; AVX512F-NEXT:    vpsraw $1, %ymm1, %ymm1
    114 ; AVX512F-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
    115 ; AVX512F-NEXT:    retq
    116 ;
    117 ; AVX512BW-LABEL: test_div7_32i16:
    118 ; AVX512BW:       # %bb.0:
    119 ; AVX512BW-NEXT:    vpmulhw {{.*}}(%rip), %zmm0, %zmm0
    120 ; AVX512BW-NEXT:    vpsrlw $15, %zmm0, %zmm1
    121 ; AVX512BW-NEXT:    vpsraw $1, %zmm0, %zmm0
    122 ; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
    123 ; AVX512BW-NEXT:    retq
    124   %res = sdiv <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
    125   ret <32 x i16> %res
    126 }
    127 
    128 define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
        ; sdiv <64 x i8> by 7.  x86 has no byte multiply, so bytes are
        ; sign-extended to words (vpmovsxbw), vpmullw'd by 65427 (-109 as i16),
        ; and the high bytes re-packed (vpsrlw $8 + vpackuswb / vpmovwb); vpaddb
        ; adds the dividend back.  The missing 8-bit arithmetic right shift is
        ; emulated as vpsrlw $2, mask with 63, then xor/sub with 32 to restore
        ; the sign; vpsrlw $7 masked with 1 supplies the rounding bit.  AVX512F
        ; works on ymm halves; AVX512BW widens through zmm words.  Autogenerated.
    129 ; AVX512F-LABEL: test_div7_64i8:
    130 ; AVX512F:       # %bb.0:
    131 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
    132 ; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm2
    133 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
    134 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
    135 ; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
    136 ; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm4
    137 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm4
    138 ; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
    139 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm2[2,3]
    140 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
    141 ; AVX512F-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
    142 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
    143 ; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm2
    144 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
    145 ; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
    146 ; AVX512F-NEXT:    vpsrlw $2, %ymm0, %ymm0
    147 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
    148 ; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
    149 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
    150 ; AVX512F-NEXT:    vpxor %ymm6, %ymm0, %ymm0
    151 ; AVX512F-NEXT:    vpsubb %ymm6, %ymm0, %ymm0
    152 ; AVX512F-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    153 ; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
    154 ; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm2
    155 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
    156 ; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
    157 ; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm7
    158 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm7, %ymm3
    159 ; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
    160 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm2[2,3]
    161 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
    162 ; AVX512F-NEXT:    vpackuswb %ymm7, %ymm2, %ymm2
    163 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
    164 ; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm2
    165 ; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
    166 ; AVX512F-NEXT:    vpsrlw $2, %ymm1, %ymm1
    167 ; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
    168 ; AVX512F-NEXT:    vpxor %ymm6, %ymm1, %ymm1
    169 ; AVX512F-NEXT:    vpsubb %ymm6, %ymm1, %ymm1
    170 ; AVX512F-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    171 ; AVX512F-NEXT:    retq
    172 ;
    173 ; AVX512BW-LABEL: test_div7_64i8:
    174 ; AVX512BW:       # %bb.0:
    175 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm1
    176 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
    177 ; AVX512BW-NEXT:    vpmullw %zmm2, %zmm1, %zmm1
    178 ; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
    179 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
    180 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
    181 ; AVX512BW-NEXT:    vpmovsxbw %ymm3, %zmm3
    182 ; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
    183 ; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
    184 ; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
    185 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
    186 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
    187 ; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm1
    188 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
    189 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
    190 ; AVX512BW-NEXT:    vpxorq %zmm2, %zmm1, %zmm1
    191 ; AVX512BW-NEXT:    vpsubb %zmm2, %zmm1, %zmm1
    192 ; AVX512BW-NEXT:    vpsrlw $7, %zmm0, %zmm0
    193 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
    194 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
    195 ; AVX512BW-NEXT:    retq
    196   %res = sdiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
    197   ret <64 x i8> %res
    198 }
    199 
    200 ;
    201 ; srem by 7
    202 ;
    203 
    204 define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
        ; srem <8 x i64> by 7, scalarized like the 64-bit sdiv lowering: each
        ; lane's quotient q comes from imulq with the 0x4924924924924925 magic
        ; plus the shrq/sarq/addq rounding fixup, then leaq (,%rdx,8), subq and
        ; addq compute x + q - 8*q = x - 7*q, i.e. the remainder, before the
        ; lanes are re-packed into the zmm result.  Autogenerated CHECKs.
    205 ; AVX-LABEL: test_rem7_8i64:
    206 ; AVX:       # %bb.0:
    207 ; AVX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
    208 ; AVX-NEXT:    vpextrq $1, %xmm1, %rcx
    209 ; AVX-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
    210 ; AVX-NEXT:    movq %rcx, %rax
    211 ; AVX-NEXT:    imulq %rsi
    212 ; AVX-NEXT:    movq %rdx, %rax
    213 ; AVX-NEXT:    shrq $63, %rax
    214 ; AVX-NEXT:    sarq %rdx
    215 ; AVX-NEXT:    addq %rax, %rdx
    216 ; AVX-NEXT:    leaq (,%rdx,8), %rax
    217 ; AVX-NEXT:    subq %rax, %rdx
    218 ; AVX-NEXT:    addq %rcx, %rdx
    219 ; AVX-NEXT:    vmovq %rdx, %xmm2
    220 ; AVX-NEXT:    vmovq %xmm1, %rcx
    221 ; AVX-NEXT:    movq %rcx, %rax
    222 ; AVX-NEXT:    imulq %rsi
    223 ; AVX-NEXT:    movq %rdx, %rax
    224 ; AVX-NEXT:    shrq $63, %rax
    225 ; AVX-NEXT:    sarq %rdx
    226 ; AVX-NEXT:    addq %rax, %rdx
    227 ; AVX-NEXT:    leaq (,%rdx,8), %rax
    228 ; AVX-NEXT:    subq %rax, %rdx
    229 ; AVX-NEXT:    addq %rcx, %rdx
    230 ; AVX-NEXT:    vmovq %rdx, %xmm1
    231 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
    232 ; AVX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
    233 ; AVX-NEXT:    vpextrq $1, %xmm2, %rcx
    234 ; AVX-NEXT:    movq %rcx, %rax
    235 ; AVX-NEXT:    imulq %rsi
    236 ; AVX-NEXT:    movq %rdx, %rax
    237 ; AVX-NEXT:    shrq $63, %rax
    238 ; AVX-NEXT:    sarq %rdx
    239 ; AVX-NEXT:    addq %rax, %rdx
    240 ; AVX-NEXT:    leaq (,%rdx,8), %rax
    241 ; AVX-NEXT:    subq %rax, %rdx
    242 ; AVX-NEXT:    addq %rcx, %rdx
    243 ; AVX-NEXT:    vmovq %rdx, %xmm3
    244 ; AVX-NEXT:    vmovq %xmm2, %rcx
    245 ; AVX-NEXT:    movq %rcx, %rax
    246 ; AVX-NEXT:    imulq %rsi
    247 ; AVX-NEXT:    movq %rdx, %rax
    248 ; AVX-NEXT:    shrq $63, %rax
    249 ; AVX-NEXT:    sarq %rdx
    250 ; AVX-NEXT:    addq %rax, %rdx
    251 ; AVX-NEXT:    leaq (,%rdx,8), %rax
    252 ; AVX-NEXT:    subq %rax, %rdx
    253 ; AVX-NEXT:    addq %rcx, %rdx
    254 ; AVX-NEXT:    vmovq %rdx, %xmm2
    255 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
    256 ; AVX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
    257 ; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm2
    258 ; AVX-NEXT:    vpextrq $1, %xmm2, %rcx
    259 ; AVX-NEXT:    movq %rcx, %rax
    260 ; AVX-NEXT:    imulq %rsi
    261 ; AVX-NEXT:    movq %rdx, %rax
    262 ; AVX-NEXT:    shrq $63, %rax
    263 ; AVX-NEXT:    sarq %rdx
    264 ; AVX-NEXT:    addq %rax, %rdx
    265 ; AVX-NEXT:    leaq (,%rdx,8), %rax
    266 ; AVX-NEXT:    subq %rax, %rdx
    267 ; AVX-NEXT:    addq %rcx, %rdx
    268 ; AVX-NEXT:    vmovq %rdx, %xmm3
    269 ; AVX-NEXT:    vmovq %xmm2, %rcx
    270 ; AVX-NEXT:    movq %rcx, %rax
    271 ; AVX-NEXT:    imulq %rsi
    272 ; AVX-NEXT:    movq %rdx, %rax
    273 ; AVX-NEXT:    shrq $63, %rax
    274 ; AVX-NEXT:    sarq %rdx
    275 ; AVX-NEXT:    addq %rax, %rdx
    276 ; AVX-NEXT:    leaq (,%rdx,8), %rax
    277 ; AVX-NEXT:    subq %rax, %rdx
    278 ; AVX-NEXT:    addq %rcx, %rdx
    279 ; AVX-NEXT:    vmovq %rdx, %xmm2
    280 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
    281 ; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
    282 ; AVX-NEXT:    movq %rcx, %rax
    283 ; AVX-NEXT:    imulq %rsi
    284 ; AVX-NEXT:    movq %rdx, %rax
    285 ; AVX-NEXT:    shrq $63, %rax
    286 ; AVX-NEXT:    sarq %rdx
    287 ; AVX-NEXT:    addq %rax, %rdx
    288 ; AVX-NEXT:    leaq (,%rdx,8), %rax
    289 ; AVX-NEXT:    subq %rax, %rdx
    290 ; AVX-NEXT:    addq %rcx, %rdx
    291 ; AVX-NEXT:    vmovq %rdx, %xmm3
    292 ; AVX-NEXT:    vmovq %xmm0, %rcx
    293 ; AVX-NEXT:    movq %rcx, %rax
    294 ; AVX-NEXT:    imulq %rsi
    295 ; AVX-NEXT:    movq %rdx, %rax
    296 ; AVX-NEXT:    shrq $63, %rax
    297 ; AVX-NEXT:    sarq %rdx
    298 ; AVX-NEXT:    addq %rax, %rdx
    299 ; AVX-NEXT:    leaq (,%rdx,8), %rax
    300 ; AVX-NEXT:    subq %rax, %rdx
    301 ; AVX-NEXT:    addq %rcx, %rdx
    302 ; AVX-NEXT:    vmovq %rdx, %xmm0
    303 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
    304 ; AVX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
    305 ; AVX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
    306 ; AVX-NEXT:    retq
    307   %res = srem <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
    308   ret <8 x i64> %res
    309 }
    310 
    311 define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
        ; srem <16 x i32> by 7: the quotient is formed exactly as in the
        ; test_div7_16i32 lowering (vpmuldq high-multiply, vpermi2d gather,
        ; vpsrld/vpsrad fixup), then vpmulld with a broadcast memory constant
        ; and vpsubd recover the remainder x - 7*q.  Autogenerated CHECKs.
    312 ; AVX-LABEL: test_rem7_16i32:
    313 ; AVX:       # %bb.0:
    314 ; AVX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
    315 ; AVX-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2
    316 ; AVX-NEXT:    vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
    317 ; AVX-NEXT:    vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
    318 ; AVX-NEXT:    vpmuldq %zmm1, %zmm3, %zmm1
    319 ; AVX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
    320 ; AVX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
    321 ; AVX-NEXT:    vpaddd %zmm0, %zmm3, %zmm1
    322 ; AVX-NEXT:    vpsrld $31, %zmm1, %zmm2
    323 ; AVX-NEXT:    vpsrad $2, %zmm1, %zmm1
    324 ; AVX-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
    325 ; AVX-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm1, %zmm1
    326 ; AVX-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
    327 ; AVX-NEXT:    retq
    328   %res = srem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
    329   ret <16 x i32> %res
    330 }
    331 
    332 define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
        ; srem <32 x i16> by 7: quotient as in the test_div7_32i16 lowering
        ; (vpmulhw by 18725 plus sign-bit fixup), then vpmullw by a splat of 7
        ; and vpsubw give the remainder.  AVX512F runs two ymm halves; AVX512BW
        ; does the whole zmm with memory-operand constants.  Autogenerated.
    333 ; AVX512F-LABEL: test_rem7_32i16:
    334 ; AVX512F:       # %bb.0:
    335 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
    336 ; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm3
    337 ; AVX512F-NEXT:    vpsrlw $15, %ymm3, %ymm4
    338 ; AVX512F-NEXT:    vpsraw $1, %ymm3, %ymm3
    339 ; AVX512F-NEXT:    vpaddw %ymm4, %ymm3, %ymm3
    340 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
    341 ; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
    342 ; AVX512F-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
    343 ; AVX512F-NEXT:    vpmulhw %ymm2, %ymm1, %ymm2
    344 ; AVX512F-NEXT:    vpsrlw $15, %ymm2, %ymm3
    345 ; AVX512F-NEXT:    vpsraw $1, %ymm2, %ymm2
    346 ; AVX512F-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
    347 ; AVX512F-NEXT:    vpmullw %ymm4, %ymm2, %ymm2
    348 ; AVX512F-NEXT:    vpsubw %ymm2, %ymm1, %ymm1
    349 ; AVX512F-NEXT:    retq
    350 ;
    351 ; AVX512BW-LABEL: test_rem7_32i16:
    352 ; AVX512BW:       # %bb.0:
    353 ; AVX512BW-NEXT:    vpmulhw {{.*}}(%rip), %zmm0, %zmm1
    354 ; AVX512BW-NEXT:    vpsrlw $15, %zmm1, %zmm2
    355 ; AVX512BW-NEXT:    vpsraw $1, %zmm1, %zmm1
    356 ; AVX512BW-NEXT:    vpaddw %zmm2, %zmm1, %zmm1
    357 ; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
    358 ; AVX512BW-NEXT:    vpsubw %zmm1, %zmm0, %zmm0
    359 ; AVX512BW-NEXT:    retq
    360   %res = srem <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
    361   ret <32 x i16> %res
    362 }
    363 
    364 define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
        ; srem <64 x i8> by 7: quotient as in the test_div7_64i8 lowering
        ; (widen to words, vpmullw by 65427, take high bytes, emulated 8-bit
        ; arithmetic shift), then the quotient bytes are widened again,
        ; vpmullw'd by 7, truncated back (vpmovsxwd + vpmovdb on AVX512F,
        ; vpmovwb on AVX512BW), and vpsubb'd from the input to form the
        ; remainder.  Autogenerated CHECKs -- do not hand-edit.
    365 ; AVX512F-LABEL: test_rem7_64i8:
    366 ; AVX512F:       # %bb.0:
    367 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
    368 ; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm3
    369 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
    370 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm3
    371 ; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
    372 ; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm4
    373 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm4, %ymm4
    374 ; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
    375 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm3[2,3]
    376 ; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
    377 ; AVX512F-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
    378 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm3, %ymm3
    379 ; AVX512F-NEXT:    vpsrlw $7, %ymm3, %ymm5
    380 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
    381 ; AVX512F-NEXT:    vpand %ymm4, %ymm5, %ymm7
    382 ; AVX512F-NEXT:    vpsrlw $2, %ymm3, %ymm3
    383 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
    384 ; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
    385 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
    386 ; AVX512F-NEXT:    vpxor %ymm6, %ymm3, %ymm3
    387 ; AVX512F-NEXT:    vpsubb %ymm6, %ymm3, %ymm3
    388 ; AVX512F-NEXT:    vpaddb %ymm7, %ymm3, %ymm7
    389 ; AVX512F-NEXT:    vpmovsxbw %xmm7, %ymm8
    390 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
    391 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm8, %ymm8
    392 ; AVX512F-NEXT:    vpmovsxwd %ymm8, %zmm8
    393 ; AVX512F-NEXT:    vpmovdb %zmm8, %xmm8
    394 ; AVX512F-NEXT:    vextracti128 $1, %ymm7, %xmm7
    395 ; AVX512F-NEXT:    vpmovsxbw %xmm7, %ymm7
    396 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm7, %ymm7
    397 ; AVX512F-NEXT:    vpmovsxwd %ymm7, %zmm7
    398 ; AVX512F-NEXT:    vpmovdb %zmm7, %xmm7
    399 ; AVX512F-NEXT:    vinserti128 $1, %xmm7, %ymm8, %ymm7
    400 ; AVX512F-NEXT:    vpsubb %ymm7, %ymm0, %ymm0
    401 ; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm7
    402 ; AVX512F-NEXT:    vpmovsxbw %xmm7, %ymm7
    403 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm7, %ymm7
    404 ; AVX512F-NEXT:    vpsrlw $8, %ymm7, %ymm7
    405 ; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm8
    406 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm8, %ymm2
    407 ; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
    408 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm2[2,3],ymm7[2,3]
    409 ; AVX512F-NEXT:    vinserti128 $1, %xmm7, %ymm2, %ymm2
    410 ; AVX512F-NEXT:    vpackuswb %ymm8, %ymm2, %ymm2
    411 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm2, %ymm2
    412 ; AVX512F-NEXT:    vpsrlw $7, %ymm2, %ymm7
    413 ; AVX512F-NEXT:    vpand %ymm4, %ymm7, %ymm4
    414 ; AVX512F-NEXT:    vpsrlw $2, %ymm2, %ymm2
    415 ; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
    416 ; AVX512F-NEXT:    vpxor %ymm6, %ymm2, %ymm2
    417 ; AVX512F-NEXT:    vpsubb %ymm6, %ymm2, %ymm2
    418 ; AVX512F-NEXT:    vpaddb %ymm4, %ymm2, %ymm2
    419 ; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm4
    420 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm4
    421 ; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
    422 ; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
    423 ; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm2
    424 ; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm2
    425 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
    426 ; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
    427 ; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
    428 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
    429 ; AVX512F-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
    430 ; AVX512F-NEXT:    retq
    431 ;
    432 ; AVX512BW-LABEL: test_rem7_64i8:
    433 ; AVX512BW:       # %bb.0:
    434 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm1
    435 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
    436 ; AVX512BW-NEXT:    vpmullw %zmm2, %zmm1, %zmm1
    437 ; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
    438 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
    439 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
    440 ; AVX512BW-NEXT:    vpmovsxbw %ymm3, %zmm3
    441 ; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
    442 ; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
    443 ; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
    444 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
    445 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm1
    446 ; AVX512BW-NEXT:    vpsrlw $2, %zmm1, %zmm2
    447 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
    448 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
    449 ; AVX512BW-NEXT:    vpxorq %zmm3, %zmm2, %zmm2
    450 ; AVX512BW-NEXT:    vpsubb %zmm3, %zmm2, %zmm2
    451 ; AVX512BW-NEXT:    vpsrlw $7, %zmm1, %zmm1
    452 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
    453 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
    454 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm2
    455 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
    456 ; AVX512BW-NEXT:    vpmullw %zmm3, %zmm2, %zmm2
    457 ; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
    458 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
    459 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
    460 ; AVX512BW-NEXT:    vpmullw %zmm3, %zmm1, %zmm1
    461 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
    462 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
    463 ; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
    464 ; AVX512BW-NEXT:    retq
    465   %res = srem <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
    466   ret <64 x i8> %res
    467 }
    468