Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
      3 ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
      4 
      5 define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
        ; Byte equality compare with an all-ones mask: calls
        ; @llvm.x86.avx512.mask.pcmpeq.b.512 on %a and %b with mask i64 -1 and
        ; returns the i64 result.
        ; The x86_64 run expects one unmasked vpcmpeqb into %k0 followed by a kmovq
        ; to %rax. The i386 run expects the mask register to be stored to the stack
        ; with kmovq and read back as two 32-bit loads into %eax and %edx.
      6 ; AVX512BW-LABEL: test_pcmpeq_b:
      7 ; AVX512BW:       ## BB#0:
      8 ; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
      9 ; AVX512BW-NEXT:    kmovq %k0, %rax
     10 ; AVX512BW-NEXT:    retq
     11 ;
     12 ; AVX512F-32-LABEL: test_pcmpeq_b:
     13 ; AVX512F-32:       # BB#0:
     14 ; AVX512F-32-NEXT:    subl $12, %esp
     15 ; AVX512F-32-NEXT:  .Ltmp0:
     16 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 16
     17 ; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
     18 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
     19 ; AVX512F-32-NEXT:    movl (%esp), %eax
     20 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
     21 ; AVX512F-32-NEXT:    addl $12, %esp
     22 ; AVX512F-32-NEXT:    retl
     23   %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
     24   ret i64 %res
     25 }
     26 
     27 define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
        ; Byte equality compare under a caller-supplied mask: passes %mask as the
        ; third operand of @llvm.x86.avx512.mask.pcmpeq.b.512 and returns the i64
        ; result.
        ; The x86_64 run expects %rdi to be moved into %k1 and used as the write
        ; mask of vpcmpeqb. The i386 run expects the 64-bit mask to be read from the
        ; stack as two kmovd loads that are joined into %k1 with kunpckdq.
     28 ; AVX512BW-LABEL: test_mask_pcmpeq_b:
     29 ; AVX512BW:       ## BB#0:
     30 ; AVX512BW-NEXT:    kmovq %rdi, %k1
     31 ; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
     32 ; AVX512BW-NEXT:    kmovq %k0, %rax
     33 ; AVX512BW-NEXT:    retq
     34 ;
     35 ; AVX512F-32-LABEL: test_mask_pcmpeq_b:
     36 ; AVX512F-32:       # BB#0:
     37 ; AVX512F-32-NEXT:    subl $12, %esp
     38 ; AVX512F-32-NEXT:  .Ltmp1:
     39 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 16
     40 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
     41 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
     42 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
     43 ; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
     44 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
     45 ; AVX512F-32-NEXT:    movl (%esp), %eax
     46 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
     47 ; AVX512F-32-NEXT:    addl $12, %esp
     48 ; AVX512F-32-NEXT:    retl
     49   %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
     50   ret i64 %res
     51 }
     52 
     53 declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64)
     54 
     55 define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
        ; Word equality compare with an all-ones mask: calls
        ; @llvm.x86.avx512.mask.pcmpeq.w.512 on %a and %b with mask i32 -1 and
        ; returns the i32 result.
        ; Both runs expect the same two instructions: an unmasked vpcmpeqw into %k0
        ; and a kmovd of %k0 into %eax (no stack traffic on either target).
     56 ; AVX512BW-LABEL: test_pcmpeq_w:
     57 ; AVX512BW:       ## BB#0:
     58 ; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
     59 ; AVX512BW-NEXT:    kmovd %k0, %eax
     60 ; AVX512BW-NEXT:    retq
     61 ;
     62 ; AVX512F-32-LABEL: test_pcmpeq_w:
     63 ; AVX512F-32:       # BB#0:
     64 ; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
     65 ; AVX512F-32-NEXT:    kmovd %k0, %eax
     66 ; AVX512F-32-NEXT:    retl
     67   %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
     68   ret i32 %res
     69 }
     70 
     71 define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
        ; Word equality compare under a caller-supplied mask: passes %mask as the
        ; third operand of @llvm.x86.avx512.mask.pcmpeq.w.512 and returns the i32
        ; result.
        ; The x86_64 run expects the mask to arrive in %edi; the i386 run expects it
        ; to be loaded from the stack. In both cases a single kmovd places it in %k1,
        ; which is then used as the write mask of vpcmpeqw.
     72 ; AVX512BW-LABEL: test_mask_pcmpeq_w:
     73 ; AVX512BW:       ## BB#0:
     74 ; AVX512BW-NEXT:    kmovd %edi, %k1
     75 ; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
     76 ; AVX512BW-NEXT:    kmovd %k0, %eax
     77 ; AVX512BW-NEXT:    retq
     78 ;
     79 ; AVX512F-32-LABEL: test_mask_pcmpeq_w:
     80 ; AVX512F-32:       # BB#0:
     81 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
     82 ; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
     83 ; AVX512F-32-NEXT:    kmovd %k0, %eax
     84 ; AVX512F-32-NEXT:    retl
     85   %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
     86   ret i32 %res
     87 }
     88 
     89 declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32)
     90 
     91 define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
        ; Byte greater-than compare with an all-ones mask: calls
        ; @llvm.x86.avx512.mask.pcmpgt.b.512 on %a and %b with mask i64 -1 and
        ; returns the i64 result.
        ; The x86_64 run expects one unmasked vpcmpgtb into %k0 followed by a kmovq
        ; to %rax. The i386 run expects the mask register to be stored to the stack
        ; with kmovq and read back as two 32-bit loads into %eax and %edx.
     92 ; AVX512BW-LABEL: test_pcmpgt_b:
     93 ; AVX512BW:       ## BB#0:
     94 ; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
     95 ; AVX512BW-NEXT:    kmovq %k0, %rax
     96 ; AVX512BW-NEXT:    retq
     97 ;
     98 ; AVX512F-32-LABEL: test_pcmpgt_b:
     99 ; AVX512F-32:       # BB#0:
    100 ; AVX512F-32-NEXT:    subl $12, %esp
    101 ; AVX512F-32-NEXT:  .Ltmp2:
    102 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 16
    103 ; AVX512F-32-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
    104 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
    105 ; AVX512F-32-NEXT:    movl (%esp), %eax
    106 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
    107 ; AVX512F-32-NEXT:    addl $12, %esp
    108 ; AVX512F-32-NEXT:    retl
    109   %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
    110   ret i64 %res
    111 }
    112 
    113 define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
        ; Byte greater-than compare under a caller-supplied mask: passes %mask as
        ; the third operand of @llvm.x86.avx512.mask.pcmpgt.b.512 and returns the
        ; i64 result.
        ; The x86_64 run expects %rdi to be moved into %k1 and used as the write
        ; mask of vpcmpgtb. The i386 run expects the 64-bit mask to be read from the
        ; stack as two kmovd loads that are joined into %k1 with kunpckdq.
    114 ; AVX512BW-LABEL: test_mask_pcmpgt_b:
    115 ; AVX512BW:       ## BB#0:
    116 ; AVX512BW-NEXT:    kmovq %rdi, %k1
    117 ; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
    118 ; AVX512BW-NEXT:    kmovq %k0, %rax
    119 ; AVX512BW-NEXT:    retq
    120 ;
    121 ; AVX512F-32-LABEL: test_mask_pcmpgt_b:
    122 ; AVX512F-32:       # BB#0:
    123 ; AVX512F-32-NEXT:    subl $12, %esp
    124 ; AVX512F-32-NEXT:  .Ltmp3:
    125 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 16
    126 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
    127 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    128 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
    129 ; AVX512F-32-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
    130 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
    131 ; AVX512F-32-NEXT:    movl (%esp), %eax
    132 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
    133 ; AVX512F-32-NEXT:    addl $12, %esp
    134 ; AVX512F-32-NEXT:    retl
    135   %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
    136   ret i64 %res
    137 }
    138 
    139 declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64)
    140 
    141 define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
        ; Word greater-than compare with an all-ones mask: calls
        ; @llvm.x86.avx512.mask.pcmpgt.w.512 on %a and %b with mask i32 -1 and
        ; returns the i32 result.
        ; Both runs expect the same two instructions: an unmasked vpcmpgtw into %k0
        ; and a kmovd of %k0 into %eax (no stack traffic on either target).
    142 ; AVX512BW-LABEL: test_pcmpgt_w:
    143 ; AVX512BW:       ## BB#0:
    144 ; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
    145 ; AVX512BW-NEXT:    kmovd %k0, %eax
    146 ; AVX512BW-NEXT:    retq
    147 ;
    148 ; AVX512F-32-LABEL: test_pcmpgt_w:
    149 ; AVX512F-32:       # BB#0:
    150 ; AVX512F-32-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
    151 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    152 ; AVX512F-32-NEXT:    retl
    153   %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
    154   ret i32 %res
    155 }
    156 
    157 define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
        ; Word greater-than compare under a caller-supplied mask: passes %mask as
        ; the third operand of @llvm.x86.avx512.mask.pcmpgt.w.512 and returns the
        ; i32 result.
        ; The x86_64 run expects the mask to arrive in %edi; the i386 run expects it
        ; to be loaded from the stack. In both cases a single kmovd places it in %k1,
        ; which is then used as the write mask of vpcmpgtw.
    158 ; AVX512BW-LABEL: test_mask_pcmpgt_w:
    159 ; AVX512BW:       ## BB#0:
    160 ; AVX512BW-NEXT:    kmovd %edi, %k1
    161 ; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
    162 ; AVX512BW-NEXT:    kmovd %k0, %eax
    163 ; AVX512BW-NEXT:    retq
    164 ;
    165 ; AVX512F-32-LABEL: test_mask_pcmpgt_w:
    166 ; AVX512F-32:       # BB#0:
    167 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    168 ; AVX512F-32-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
    169 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    170 ; AVX512F-32-NEXT:    retl
    171   %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
    172   ret i32 %res
    173 }
    174 
    175 declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)
    176 
    177 define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
        ; Byte compare with every predicate immediate and an all-ones mask: calls
        ; @llvm.x86.avx512.mask.cmp.b.512 eight times with the i32 immediate set to
        ; 0 through 7 and mask i64 -1, then returns the sum of the eight i64 results.
        ; The expected mnemonics, in immediate order 0..7, are vpcmpeqb, vpcmpltb,
        ; vpcmpleb, vpcmpunordb, vpcmpneqb, vpcmpnltb, vpcmpnleb and vpcmpordb.
        ; The x86_64 run accumulates with kmovq + addq in registers. The i386 run
        ; spills each 64-bit mask to a 68-byte stack frame and accumulates the two
        ; halves into %eax/%edx with addl + adcl.
    178 ; AVX512BW-LABEL: test_cmp_b_512:
    179 ; AVX512BW:       ## BB#0:
    180 ; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
    181 ; AVX512BW-NEXT:    kmovq %k0, %rax
    182 ; AVX512BW-NEXT:    vpcmpltb %zmm1, %zmm0, %k0
    183 ; AVX512BW-NEXT:    kmovq %k0, %rcx
    184 ; AVX512BW-NEXT:    addq %rax, %rcx
    185 ; AVX512BW-NEXT:    vpcmpleb %zmm1, %zmm0, %k0
    186 ; AVX512BW-NEXT:    kmovq %k0, %rax
    187 ; AVX512BW-NEXT:    addq %rcx, %rax
    188 ; AVX512BW-NEXT:    vpcmpunordb %zmm1, %zmm0, %k0
    189 ; AVX512BW-NEXT:    kmovq %k0, %rcx
    190 ; AVX512BW-NEXT:    addq %rax, %rcx
    191 ; AVX512BW-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0
    192 ; AVX512BW-NEXT:    kmovq %k0, %rax
    193 ; AVX512BW-NEXT:    addq %rcx, %rax
    194 ; AVX512BW-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0
    195 ; AVX512BW-NEXT:    kmovq %k0, %rcx
    196 ; AVX512BW-NEXT:    addq %rax, %rcx
    197 ; AVX512BW-NEXT:    vpcmpnleb %zmm1, %zmm0, %k0
    198 ; AVX512BW-NEXT:    kmovq %k0, %rdx
    199 ; AVX512BW-NEXT:    addq %rcx, %rdx
    200 ; AVX512BW-NEXT:    vpcmpordb %zmm1, %zmm0, %k0
    201 ; AVX512BW-NEXT:    kmovq %k0, %rax
    202 ; AVX512BW-NEXT:    addq %rdx, %rax
    203 ; AVX512BW-NEXT:    retq
    204 ;
    205 ; AVX512F-32-LABEL: test_cmp_b_512:
    206 ; AVX512F-32:       # BB#0:
    207 ; AVX512F-32-NEXT:    subl $68, %esp
    208 ; AVX512F-32-NEXT:  .Ltmp4:
    209 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 72
    210 ; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
    211 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    212 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    213 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
    214 ; AVX512F-32-NEXT:    vpcmpltb %zmm1, %zmm0, %k0
    215 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    216 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    217 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    218 ; AVX512F-32-NEXT:    vpcmpleb %zmm1, %zmm0, %k0
    219 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    220 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    221 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    222 ; AVX512F-32-NEXT:    vpcmpunordb %zmm1, %zmm0, %k0
    223 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    224 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    225 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    226 ; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0
    227 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    228 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    229 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    230 ; AVX512F-32-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0
    231 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    232 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    233 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    234 ; AVX512F-32-NEXT:    vpcmpnleb %zmm1, %zmm0, %k0
    235 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    236 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    237 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    238 ; AVX512F-32-NEXT:    vpcmpordb %zmm1, %zmm0, %k0
    239 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
    240 ; AVX512F-32-NEXT:    addl (%esp), %eax
    241 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    242 ; AVX512F-32-NEXT:    addl $68, %esp
    243 ; AVX512F-32-NEXT:    retl
    244   %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
    245   %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
    246   %ret1 = add i64 %res0, %res1
    247   %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
    248   %ret2 = add i64 %ret1, %res2
    249   %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
    250   %ret3 = add i64 %ret2, %res3
    251   %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
    252   %ret4 = add i64 %ret3, %res4
    253   %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
    254   %ret5 = add i64 %ret4, %res5
    255   %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
    256   %ret6 = add i64 %ret5, %res6
    257   %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
    258   %ret7 = add i64 %ret6, %res7
    259   ret i64 %ret7
    260 }
    261 
    262 define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
        ; Byte compare with every predicate immediate under a caller-supplied mask:
        ; calls @llvm.x86.avx512.mask.cmp.b.512 eight times with the i32 immediate
        ; set to 0 through 7 and %mask as the last operand, then returns the sum of
        ; the eight i64 results.
        ; Both runs expect the mask to be materialised in %k1 once and reused as the
        ; write mask of all eight compares. The x86_64 run loads it from %rdi with
        ; kmovq; the i386 run builds it from two kmovd stack loads joined by
        ; kunpckdq, and accumulates the spilled results with addl + adcl.
    263 ; AVX512BW-LABEL: test_mask_cmp_b_512:
    264 ; AVX512BW:       ## BB#0:
    265 ; AVX512BW-NEXT:    kmovq %rdi, %k1
    266 ; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
    267 ; AVX512BW-NEXT:    kmovq %k0, %rax
    268 ; AVX512BW-NEXT:    vpcmpltb %zmm1, %zmm0, %k0 {%k1}
    269 ; AVX512BW-NEXT:    kmovq %k0, %rcx
    270 ; AVX512BW-NEXT:    addq %rax, %rcx
    271 ; AVX512BW-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 {%k1}
    272 ; AVX512BW-NEXT:    kmovq %k0, %rax
    273 ; AVX512BW-NEXT:    addq %rcx, %rax
    274 ; AVX512BW-NEXT:    vpcmpunordb %zmm1, %zmm0, %k0 {%k1}
    275 ; AVX512BW-NEXT:    kmovq %k0, %rcx
    276 ; AVX512BW-NEXT:    addq %rax, %rcx
    277 ; AVX512BW-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
    278 ; AVX512BW-NEXT:    kmovq %k0, %rax
    279 ; AVX512BW-NEXT:    addq %rcx, %rax
    280 ; AVX512BW-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 {%k1}
    281 ; AVX512BW-NEXT:    kmovq %k0, %rcx
    282 ; AVX512BW-NEXT:    addq %rax, %rcx
    283 ; AVX512BW-NEXT:    vpcmpnleb %zmm1, %zmm0, %k0 {%k1}
    284 ; AVX512BW-NEXT:    kmovq %k0, %rdx
    285 ; AVX512BW-NEXT:    addq %rcx, %rdx
    286 ; AVX512BW-NEXT:    vpcmpordb %zmm1, %zmm0, %k0 {%k1}
    287 ; AVX512BW-NEXT:    kmovq %k0, %rax
    288 ; AVX512BW-NEXT:    addq %rdx, %rax
    289 ; AVX512BW-NEXT:    retq
    290 ;
    291 ; AVX512F-32-LABEL: test_mask_cmp_b_512:
    292 ; AVX512F-32:       # BB#0:
    293 ; AVX512F-32-NEXT:    subl $68, %esp
    294 ; AVX512F-32-NEXT:  .Ltmp5:
    295 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 72
    296 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
    297 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    298 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
    299 ; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
    300 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
    301 ; AVX512F-32-NEXT:    movl (%esp), %eax
    302 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
    303 ; AVX512F-32-NEXT:    vpcmpltb %zmm1, %zmm0, %k0 {%k1}
    304 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    305 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    306 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    307 ; AVX512F-32-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 {%k1}
    308 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    309 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    310 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    311 ; AVX512F-32-NEXT:    vpcmpunordb %zmm1, %zmm0, %k0 {%k1}
    312 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    313 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    314 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    315 ; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
    316 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    317 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    318 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    319 ; AVX512F-32-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 {%k1}
    320 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    321 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    322 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    323 ; AVX512F-32-NEXT:    vpcmpnleb %zmm1, %zmm0, %k0 {%k1}
    324 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    325 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    326 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    327 ; AVX512F-32-NEXT:    vpcmpordb %zmm1, %zmm0, %k0 {%k1}
    328 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    329 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    330 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    331 ; AVX512F-32-NEXT:    addl $68, %esp
    332 ; AVX512F-32-NEXT:    retl
    333   %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
    334   %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
    335   %ret1 = add i64 %res0, %res1
    336   %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
    337   %ret2 = add i64 %ret1, %res2
    338   %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
    339   %ret3 = add i64 %ret2, %res3
    340   %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
    341   %ret4 = add i64 %ret3, %res4
    342   %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
    343   %ret5 = add i64 %ret4, %res5
    344   %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
    345   %ret6 = add i64 %ret5, %res6
    346   %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
    347   %ret7 = add i64 %ret6, %res7
    348   ret i64 %ret7
    349 }
    350 
    351 declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
    352 
    353 define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
        ; "ucmp" byte compare with every predicate immediate and an all-ones mask:
        ; calls @llvm.x86.avx512.mask.ucmp.b.512 eight times with the i32 immediate
        ; set to 0 through 7 and mask i64 -1, then returns the sum of the eight i64
        ; results.
        ; The expected mnemonics, in immediate order 0..7, are vpcmpequb, vpcmpltub,
        ; vpcmpleub, vpcmpunordub, vpcmpnequb, vpcmpnltub, vpcmpnleub and vpcmpordub.
        ; The x86_64 run accumulates with kmovq + addq in registers. The i386 run
        ; spills each 64-bit mask to a 68-byte stack frame and accumulates the two
        ; halves into %eax/%edx with addl + adcl.
    354 ; AVX512BW-LABEL: test_ucmp_b_512:
    355 ; AVX512BW:       ## BB#0:
    356 ; AVX512BW-NEXT:    vpcmpequb %zmm1, %zmm0, %k0
    357 ; AVX512BW-NEXT:    kmovq %k0, %rax
    358 ; AVX512BW-NEXT:    vpcmpltub %zmm1, %zmm0, %k0
    359 ; AVX512BW-NEXT:    kmovq %k0, %rcx
    360 ; AVX512BW-NEXT:    addq %rax, %rcx
    361 ; AVX512BW-NEXT:    vpcmpleub %zmm1, %zmm0, %k0
    362 ; AVX512BW-NEXT:    kmovq %k0, %rax
    363 ; AVX512BW-NEXT:    addq %rcx, %rax
    364 ; AVX512BW-NEXT:    vpcmpunordub %zmm1, %zmm0, %k0
    365 ; AVX512BW-NEXT:    kmovq %k0, %rcx
    366 ; AVX512BW-NEXT:    addq %rax, %rcx
    367 ; AVX512BW-NEXT:    vpcmpnequb %zmm1, %zmm0, %k0
    368 ; AVX512BW-NEXT:    kmovq %k0, %rax
    369 ; AVX512BW-NEXT:    addq %rcx, %rax
    370 ; AVX512BW-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0
    371 ; AVX512BW-NEXT:    kmovq %k0, %rcx
    372 ; AVX512BW-NEXT:    addq %rax, %rcx
    373 ; AVX512BW-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
    374 ; AVX512BW-NEXT:    kmovq %k0, %rdx
    375 ; AVX512BW-NEXT:    addq %rcx, %rdx
    376 ; AVX512BW-NEXT:    vpcmpordub %zmm1, %zmm0, %k0
    377 ; AVX512BW-NEXT:    kmovq %k0, %rax
    378 ; AVX512BW-NEXT:    addq %rdx, %rax
    379 ; AVX512BW-NEXT:    retq
    380 ;
    381 ; AVX512F-32-LABEL: test_ucmp_b_512:
    382 ; AVX512F-32:       # BB#0:
    383 ; AVX512F-32-NEXT:    subl $68, %esp
    384 ; AVX512F-32-NEXT:  .Ltmp6:
    385 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 72
    386 ; AVX512F-32-NEXT:    vpcmpequb %zmm1, %zmm0, %k0
    387 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    388 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    389 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
    390 ; AVX512F-32-NEXT:    vpcmpltub %zmm1, %zmm0, %k0
    391 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    392 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    393 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    394 ; AVX512F-32-NEXT:    vpcmpleub %zmm1, %zmm0, %k0
    395 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    396 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    397 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    398 ; AVX512F-32-NEXT:    vpcmpunordub %zmm1, %zmm0, %k0
    399 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    400 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    401 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    402 ; AVX512F-32-NEXT:    vpcmpnequb %zmm1, %zmm0, %k0
    403 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    404 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    405 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    406 ; AVX512F-32-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0
    407 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    408 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    409 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    410 ; AVX512F-32-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
    411 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    412 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    413 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    414 ; AVX512F-32-NEXT:    vpcmpordub %zmm1, %zmm0, %k0
    415 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
    416 ; AVX512F-32-NEXT:    addl (%esp), %eax
    417 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    418 ; AVX512F-32-NEXT:    addl $68, %esp
    419 ; AVX512F-32-NEXT:    retl
    420   %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
    421   %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
    422   %ret1 = add i64 %res0, %res1
    423   %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
    424   %ret2 = add i64 %ret1, %res2
    425   %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
    426   %ret3 = add i64 %ret2, %res3
    427   %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
    428   %ret4 = add i64 %ret3, %res4
    429   %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
    430   %ret5 = add i64 %ret4, %res5
    431   %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
    432   %ret6 = add i64 %ret5, %res6
    433   %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
    434   %ret7 = add i64 %ret6, %res7
    435   ret i64 %ret7
    436 }
    437 
    438 define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
        ; "ucmp" byte compare with every predicate immediate under a caller-supplied
        ; mask: calls @llvm.x86.avx512.mask.ucmp.b.512 eight times with the i32
        ; immediate set to 0 through 7 and %mask as the last operand, then returns
        ; the sum of the eight i64 results.
        ; Both runs expect the mask to be materialised in %k1 once and reused as the
        ; write mask of all eight compares. The x86_64 run loads it from %rdi with
        ; kmovq; the i386 run builds it from two kmovd stack loads joined by
        ; kunpckdq, and accumulates the spilled results with addl + adcl.
    439 ; AVX512BW-LABEL: test_mask_x86_avx512_ucmp_b_512:
    440 ; AVX512BW:       ## BB#0:
    441 ; AVX512BW-NEXT:    kmovq %rdi, %k1
    442 ; AVX512BW-NEXT:    vpcmpequb %zmm1, %zmm0, %k0 {%k1}
    443 ; AVX512BW-NEXT:    kmovq %k0, %rax
    444 ; AVX512BW-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1}
    445 ; AVX512BW-NEXT:    kmovq %k0, %rcx
    446 ; AVX512BW-NEXT:    addq %rax, %rcx
    447 ; AVX512BW-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 {%k1}
    448 ; AVX512BW-NEXT:    kmovq %k0, %rax
    449 ; AVX512BW-NEXT:    addq %rcx, %rax
    450 ; AVX512BW-NEXT:    vpcmpunordub %zmm1, %zmm0, %k0 {%k1}
    451 ; AVX512BW-NEXT:    kmovq %k0, %rcx
    452 ; AVX512BW-NEXT:    addq %rax, %rcx
    453 ; AVX512BW-NEXT:    vpcmpnequb %zmm1, %zmm0, %k0 {%k1}
    454 ; AVX512BW-NEXT:    kmovq %k0, %rax
    455 ; AVX512BW-NEXT:    addq %rcx, %rax
    456 ; AVX512BW-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 {%k1}
    457 ; AVX512BW-NEXT:    kmovq %k0, %rcx
    458 ; AVX512BW-NEXT:    addq %rax, %rcx
    459 ; AVX512BW-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 {%k1}
    460 ; AVX512BW-NEXT:    kmovq %k0, %rdx
    461 ; AVX512BW-NEXT:    addq %rcx, %rdx
    462 ; AVX512BW-NEXT:    vpcmpordub %zmm1, %zmm0, %k0 {%k1}
    463 ; AVX512BW-NEXT:    kmovq %k0, %rax
    464 ; AVX512BW-NEXT:    addq %rdx, %rax
    465 ; AVX512BW-NEXT:    retq
    466 ;
    467 ; AVX512F-32-LABEL: test_mask_x86_avx512_ucmp_b_512:
    468 ; AVX512F-32:       # BB#0:
    469 ; AVX512F-32-NEXT:    subl $68, %esp
    470 ; AVX512F-32-NEXT:  .Ltmp7:
    471 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 72
    472 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
    473 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    474 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
    475 ; AVX512F-32-NEXT:    vpcmpequb %zmm1, %zmm0, %k0 {%k1}
    476 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
    477 ; AVX512F-32-NEXT:    movl (%esp), %eax
    478 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
    479 ; AVX512F-32-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1}
    480 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    481 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    482 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    483 ; AVX512F-32-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 {%k1}
    484 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    485 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    486 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    487 ; AVX512F-32-NEXT:    vpcmpunordub %zmm1, %zmm0, %k0 {%k1}
    488 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    489 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    490 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    491 ; AVX512F-32-NEXT:    vpcmpnequb %zmm1, %zmm0, %k0 {%k1}
    492 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    493 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    494 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    495 ; AVX512F-32-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 {%k1}
    496 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    497 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    498 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    499 ; AVX512F-32-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 {%k1}
    500 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    501 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    502 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    503 ; AVX512F-32-NEXT:    vpcmpordub %zmm1, %zmm0, %k0 {%k1}
    504 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
    505 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
    506 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
    507 ; AVX512F-32-NEXT:    addl $68, %esp
    508 ; AVX512F-32-NEXT:    retl
    509   %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
    510   %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
    511   %ret1 = add i64 %res0, %res1
    512   %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
    513   %ret2 = add i64 %ret1, %res2
    514   %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
    515   %ret3 = add i64 %ret2, %res3
    516   %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
    517   %ret4 = add i64 %ret3, %res4
    518   %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
    519   %ret5 = add i64 %ret4, %res5
    520   %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
    521   %ret6 = add i64 %ret5, %res6
    522   %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
    523   %ret7 = add i64 %ret6, %res7
    524   ret i64 %ret7
    525 }
    526 
    527 declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
    528 
    529 define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
        ; Word compare with every predicate immediate and an all-ones mask: calls
        ; @llvm.x86.avx512.mask.cmp.w.512 eight times with the i32 immediate set to
        ; 0 through 7 and mask i32 -1, then returns the sum of the eight i32 results.
        ; The expected mnemonics, in immediate order 0..7, are vpcmpeqw, vpcmpltw,
        ; vpcmplew, vpcmpunordw, vpcmpneqw, vpcmpnltw, vpcmpnlew and vpcmpordw.
        ; The result fits in 32 bits, so both runs expect the same register-only
        ; sequence of kmovd + addl; only the return instruction differs.
    530 ; AVX512BW-LABEL: test_cmp_w_512:
    531 ; AVX512BW:       ## BB#0:
    532 ; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
    533 ; AVX512BW-NEXT:    kmovd %k0, %eax
    534 ; AVX512BW-NEXT:    vpcmpltw %zmm1, %zmm0, %k0
    535 ; AVX512BW-NEXT:    kmovd %k0, %ecx
    536 ; AVX512BW-NEXT:    addl %eax, %ecx
    537 ; AVX512BW-NEXT:    vpcmplew %zmm1, %zmm0, %k0
    538 ; AVX512BW-NEXT:    kmovd %k0, %eax
    539 ; AVX512BW-NEXT:    addl %ecx, %eax
    540 ; AVX512BW-NEXT:    vpcmpunordw %zmm1, %zmm0, %k0
    541 ; AVX512BW-NEXT:    kmovd %k0, %ecx
    542 ; AVX512BW-NEXT:    addl %eax, %ecx
    543 ; AVX512BW-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0
    544 ; AVX512BW-NEXT:    kmovd %k0, %eax
    545 ; AVX512BW-NEXT:    addl %ecx, %eax
    546 ; AVX512BW-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0
    547 ; AVX512BW-NEXT:    kmovd %k0, %ecx
    548 ; AVX512BW-NEXT:    addl %eax, %ecx
    549 ; AVX512BW-NEXT:    vpcmpnlew %zmm1, %zmm0, %k0
    550 ; AVX512BW-NEXT:    kmovd %k0, %edx
    551 ; AVX512BW-NEXT:    addl %ecx, %edx
    552 ; AVX512BW-NEXT:    vpcmpordw %zmm1, %zmm0, %k0
    553 ; AVX512BW-NEXT:    kmovd %k0, %eax
    554 ; AVX512BW-NEXT:    addl %edx, %eax
    555 ; AVX512BW-NEXT:    retq
    556 ;
    557 ; AVX512F-32-LABEL: test_cmp_w_512:
    558 ; AVX512F-32:       # BB#0:
    559 ; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
    560 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    561 ; AVX512F-32-NEXT:    vpcmpltw %zmm1, %zmm0, %k0
    562 ; AVX512F-32-NEXT:    kmovd %k0, %ecx
    563 ; AVX512F-32-NEXT:    addl %eax, %ecx
    564 ; AVX512F-32-NEXT:    vpcmplew %zmm1, %zmm0, %k0
    565 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    566 ; AVX512F-32-NEXT:    addl %ecx, %eax
    567 ; AVX512F-32-NEXT:    vpcmpunordw %zmm1, %zmm0, %k0
    568 ; AVX512F-32-NEXT:    kmovd %k0, %ecx
    569 ; AVX512F-32-NEXT:    addl %eax, %ecx
    570 ; AVX512F-32-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0
    571 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    572 ; AVX512F-32-NEXT:    addl %ecx, %eax
    573 ; AVX512F-32-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0
    574 ; AVX512F-32-NEXT:    kmovd %k0, %ecx
    575 ; AVX512F-32-NEXT:    addl %eax, %ecx
    576 ; AVX512F-32-NEXT:    vpcmpnlew %zmm1, %zmm0, %k0
    577 ; AVX512F-32-NEXT:    kmovd %k0, %edx
    578 ; AVX512F-32-NEXT:    addl %ecx, %edx
    579 ; AVX512F-32-NEXT:    vpcmpordw %zmm1, %zmm0, %k0
    580 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    581 ; AVX512F-32-NEXT:    addl %edx, %eax
    582 ; AVX512F-32-NEXT:    retl
    583   %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
    584   %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
    585   %ret1 = add i32 %res0, %res1
    586   %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
    587   %ret2 = add i32 %ret1, %res2
    588   %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
    589   %ret3 = add i32 %ret2, %res3
    590   %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
    591   %ret4 = add i32 %ret3, %res4
    592   %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
    593   %ret5 = add i32 %ret4, %res5
    594   %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
    595   %ret6 = add i32 %ret5, %res6
    596   %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
    597   %ret7 = add i32 %ret6, %res7
    598   ret i32 %ret7
    599 }
    600 
    ; Masked word-compare test: calls @llvm.x86.avx512.mask.cmp.w.512 once for each
    ; predicate immediate 0..7, every time under the caller-supplied %mask, and
    ; returns the sum of the eight i32 result masks.
    ; The autogenerated assertions expect, for both RUN configurations, a single
    ; move of the mask into %k1 followed by eight {%k1}-masked compares in immediate
    ; order (eq, lt, le, unord, neq, nlt, nle, ord), accumulated with kmovd/addl.
    ; Do not hand-edit the assertion lines; regenerate them with
    ; utils/update_llc_test_checks.py.
    601 define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
    602 ; AVX512BW-LABEL: test_mask_cmp_w_512:
    603 ; AVX512BW:       ## BB#0:
    604 ; AVX512BW-NEXT:    kmovd %edi, %k1
    605 ; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
    606 ; AVX512BW-NEXT:    kmovd %k0, %eax
    607 ; AVX512BW-NEXT:    vpcmpltw %zmm1, %zmm0, %k0 {%k1}
    608 ; AVX512BW-NEXT:    kmovd %k0, %ecx
    609 ; AVX512BW-NEXT:    addl %eax, %ecx
    610 ; AVX512BW-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1}
    611 ; AVX512BW-NEXT:    kmovd %k0, %eax
    612 ; AVX512BW-NEXT:    addl %ecx, %eax
    613 ; AVX512BW-NEXT:    vpcmpunordw %zmm1, %zmm0, %k0 {%k1}
    614 ; AVX512BW-NEXT:    kmovd %k0, %ecx
    615 ; AVX512BW-NEXT:    addl %eax, %ecx
    616 ; AVX512BW-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1}
    617 ; AVX512BW-NEXT:    kmovd %k0, %eax
    618 ; AVX512BW-NEXT:    addl %ecx, %eax
    619 ; AVX512BW-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 {%k1}
    620 ; AVX512BW-NEXT:    kmovd %k0, %ecx
    621 ; AVX512BW-NEXT:    addl %eax, %ecx
    622 ; AVX512BW-NEXT:    vpcmpnlew %zmm1, %zmm0, %k0 {%k1}
    623 ; AVX512BW-NEXT:    kmovd %k0, %edx
    624 ; AVX512BW-NEXT:    addl %ecx, %edx
    625 ; AVX512BW-NEXT:    vpcmpordw %zmm1, %zmm0, %k0 {%k1}
    626 ; AVX512BW-NEXT:    kmovd %k0, %eax
    627 ; AVX512BW-NEXT:    addl %edx, %eax
    628 ; AVX512BW-NEXT:    retq
    629 ;
    630 ; AVX512F-32-LABEL: test_mask_cmp_w_512:
    631 ; AVX512F-32:       # BB#0:
    632 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    633 ; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
    634 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    635 ; AVX512F-32-NEXT:    vpcmpltw %zmm1, %zmm0, %k0 {%k1}
    636 ; AVX512F-32-NEXT:    kmovd %k0, %ecx
    637 ; AVX512F-32-NEXT:    addl %eax, %ecx
    638 ; AVX512F-32-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1}
    639 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    640 ; AVX512F-32-NEXT:    addl %ecx, %eax
    641 ; AVX512F-32-NEXT:    vpcmpunordw %zmm1, %zmm0, %k0 {%k1}
    642 ; AVX512F-32-NEXT:    kmovd %k0, %ecx
    643 ; AVX512F-32-NEXT:    addl %eax, %ecx
    644 ; AVX512F-32-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1}
    645 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    646 ; AVX512F-32-NEXT:    addl %ecx, %eax
    647 ; AVX512F-32-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 {%k1}
    648 ; AVX512F-32-NEXT:    kmovd %k0, %ecx
    649 ; AVX512F-32-NEXT:    addl %eax, %ecx
    650 ; AVX512F-32-NEXT:    vpcmpnlew %zmm1, %zmm0, %k0 {%k1}
    651 ; AVX512F-32-NEXT:    kmovd %k0, %edx
    652 ; AVX512F-32-NEXT:    addl %ecx, %edx
    653 ; AVX512F-32-NEXT:    vpcmpordw %zmm1, %zmm0, %k0 {%k1}
    654 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    655 ; AVX512F-32-NEXT:    addl %edx, %eax
    656 ; AVX512F-32-NEXT:    retl
    657   %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
    658   %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
    659   %ret1 = add i32 %res0, %res1
    660   %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
    661   %ret2 = add i32 %ret1, %res2
    662   %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
    663   %ret3 = add i32 %ret2, %res3
    664   %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
    665   %ret4 = add i32 %ret3, %res4
    666   %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
    667   %ret5 = add i32 %ret4, %res5
    668   %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
    669   %ret6 = add i32 %ret5, %res6
    670   %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
    671   %ret7 = add i32 %ret6, %res7
    672   ret i32 %ret7
    673 }
    674 
    675 declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
    676 
    ; Unmasked ucmp test: calls @llvm.x86.avx512.mask.ucmp.w.512 once for each
    ; predicate immediate 0..7 with an all-ones mask (i32 -1) and returns the sum
    ; of the eight i32 result masks.
    ; The autogenerated assertions expect eight vpcmp*uw compares, in immediate
    ; order and without any {%k} write-mask operand, accumulated with kmovd/addl.
    677 define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
    678 ; AVX512BW-LABEL: test_ucmp_w_512:
    679 ; AVX512BW:       ## BB#0:
    680 ; AVX512BW-NEXT:    vpcmpequw %zmm1, %zmm0, %k0
    681 ; AVX512BW-NEXT:    kmovd %k0, %eax
    682 ; AVX512BW-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0
    683 ; AVX512BW-NEXT:    kmovd %k0, %ecx
    684 ; AVX512BW-NEXT:    addl %eax, %ecx
    685 ; AVX512BW-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0
    686 ; AVX512BW-NEXT:    kmovd %k0, %eax
    687 ; AVX512BW-NEXT:    addl %ecx, %eax
    688 ; AVX512BW-NEXT:    vpcmpunorduw %zmm1, %zmm0, %k0
    689 ; AVX512BW-NEXT:    kmovd %k0, %ecx
    690 ; AVX512BW-NEXT:    addl %eax, %ecx
    691 ; AVX512BW-NEXT:    vpcmpnequw %zmm1, %zmm0, %k0
    692 ; AVX512BW-NEXT:    kmovd %k0, %eax
    693 ; AVX512BW-NEXT:    addl %ecx, %eax
    694 ; AVX512BW-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0
    695 ; AVX512BW-NEXT:    kmovd %k0, %ecx
    696 ; AVX512BW-NEXT:    addl %eax, %ecx
    697 ; AVX512BW-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0
    698 ; AVX512BW-NEXT:    kmovd %k0, %edx
    699 ; AVX512BW-NEXT:    addl %ecx, %edx
    700 ; AVX512BW-NEXT:    vpcmporduw %zmm1, %zmm0, %k0
    701 ; AVX512BW-NEXT:    kmovd %k0, %eax
    702 ; AVX512BW-NEXT:    addl %edx, %eax
    703 ; AVX512BW-NEXT:    retq
    704 ;
    705 ; AVX512F-32-LABEL: test_ucmp_w_512:
    706 ; AVX512F-32:       # BB#0:
    707 ; AVX512F-32-NEXT:    vpcmpequw %zmm1, %zmm0, %k0
    708 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    709 ; AVX512F-32-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0
    710 ; AVX512F-32-NEXT:    kmovd %k0, %ecx
    711 ; AVX512F-32-NEXT:    addl %eax, %ecx
    712 ; AVX512F-32-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0
    713 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    714 ; AVX512F-32-NEXT:    addl %ecx, %eax
    715 ; AVX512F-32-NEXT:    vpcmpunorduw %zmm1, %zmm0, %k0
    716 ; AVX512F-32-NEXT:    kmovd %k0, %ecx
    717 ; AVX512F-32-NEXT:    addl %eax, %ecx
    718 ; AVX512F-32-NEXT:    vpcmpnequw %zmm1, %zmm0, %k0
    719 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    720 ; AVX512F-32-NEXT:    addl %ecx, %eax
    721 ; AVX512F-32-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0
    722 ; AVX512F-32-NEXT:    kmovd %k0, %ecx
    723 ; AVX512F-32-NEXT:    addl %eax, %ecx
    724 ; AVX512F-32-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0
    725 ; AVX512F-32-NEXT:    kmovd %k0, %edx
    726 ; AVX512F-32-NEXT:    addl %ecx, %edx
    727 ; AVX512F-32-NEXT:    vpcmporduw %zmm1, %zmm0, %k0
    728 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    729 ; AVX512F-32-NEXT:    addl %edx, %eax
    730 ; AVX512F-32-NEXT:    retl
    731   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
    732   %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
    733   %ret1 = add i32 %res0, %res1
    734   %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
    735   %ret2 = add i32 %ret1, %res2
    736   %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
    737   %ret3 = add i32 %ret2, %res3
    738   %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
    739   %ret4 = add i32 %ret3, %res4
    740   %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
    741   %ret5 = add i32 %ret4, %res5
    742   %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
    743   %ret6 = add i32 %ret5, %res6
    744   %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
    745   %ret7 = add i32 %ret6, %res7
    746   ret i32 %ret7
    747 }
    748 
    ; Masked ucmp test: calls @llvm.x86.avx512.mask.ucmp.w.512 once for each
    ; predicate immediate 0..7 under the caller-supplied %mask and returns the sum
    ; of the eight i32 result masks.
    ; The autogenerated assertions expect the mask to be moved into %k1 once (from
    ; %edi on the x86_64 run, from the stack on the i386 run) and then eight
    ; {%k1}-masked vpcmp*uw compares in immediate order, accumulated with
    ; kmovd/addl.
    749 define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
    750 ; AVX512BW-LABEL: test_mask_ucmp_w_512:
    751 ; AVX512BW:       ## BB#0:
    752 ; AVX512BW-NEXT:    kmovd %edi, %k1
    753 ; AVX512BW-NEXT:    vpcmpequw %zmm1, %zmm0, %k0 {%k1}
    754 ; AVX512BW-NEXT:    kmovd %k0, %eax
    755 ; AVX512BW-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1}
    756 ; AVX512BW-NEXT:    kmovd %k0, %ecx
    757 ; AVX512BW-NEXT:    addl %eax, %ecx
    758 ; AVX512BW-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1}
    759 ; AVX512BW-NEXT:    kmovd %k0, %eax
    760 ; AVX512BW-NEXT:    addl %ecx, %eax
    761 ; AVX512BW-NEXT:    vpcmpunorduw %zmm1, %zmm0, %k0 {%k1}
    762 ; AVX512BW-NEXT:    kmovd %k0, %ecx
    763 ; AVX512BW-NEXT:    addl %eax, %ecx
    764 ; AVX512BW-NEXT:    vpcmpnequw %zmm1, %zmm0, %k0 {%k1}
    765 ; AVX512BW-NEXT:    kmovd %k0, %eax
    766 ; AVX512BW-NEXT:    addl %ecx, %eax
    767 ; AVX512BW-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 {%k1}
    768 ; AVX512BW-NEXT:    kmovd %k0, %ecx
    769 ; AVX512BW-NEXT:    addl %eax, %ecx
    770 ; AVX512BW-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 {%k1}
    771 ; AVX512BW-NEXT:    kmovd %k0, %edx
    772 ; AVX512BW-NEXT:    addl %ecx, %edx
    773 ; AVX512BW-NEXT:    vpcmporduw %zmm1, %zmm0, %k0 {%k1}
    774 ; AVX512BW-NEXT:    kmovd %k0, %eax
    775 ; AVX512BW-NEXT:    addl %edx, %eax
    776 ; AVX512BW-NEXT:    retq
    777 ;
    778 ; AVX512F-32-LABEL: test_mask_ucmp_w_512:
    779 ; AVX512F-32:       # BB#0:
    780 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    781 ; AVX512F-32-NEXT:    vpcmpequw %zmm1, %zmm0, %k0 {%k1}
    782 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    783 ; AVX512F-32-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1}
    784 ; AVX512F-32-NEXT:    kmovd %k0, %ecx
    785 ; AVX512F-32-NEXT:    addl %eax, %ecx
    786 ; AVX512F-32-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1}
    787 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    788 ; AVX512F-32-NEXT:    addl %ecx, %eax
    789 ; AVX512F-32-NEXT:    vpcmpunorduw %zmm1, %zmm0, %k0 {%k1}
    790 ; AVX512F-32-NEXT:    kmovd %k0, %ecx
    791 ; AVX512F-32-NEXT:    addl %eax, %ecx
    792 ; AVX512F-32-NEXT:    vpcmpnequw %zmm1, %zmm0, %k0 {%k1}
    793 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    794 ; AVX512F-32-NEXT:    addl %ecx, %eax
    795 ; AVX512F-32-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 {%k1}
    796 ; AVX512F-32-NEXT:    kmovd %k0, %ecx
    797 ; AVX512F-32-NEXT:    addl %eax, %ecx
    798 ; AVX512F-32-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 {%k1}
    799 ; AVX512F-32-NEXT:    kmovd %k0, %edx
    800 ; AVX512F-32-NEXT:    addl %ecx, %edx
    801 ; AVX512F-32-NEXT:    vpcmporduw %zmm1, %zmm0, %k0 {%k1}
    802 ; AVX512F-32-NEXT:    kmovd %k0, %eax
    803 ; AVX512F-32-NEXT:    addl %edx, %eax
    804 ; AVX512F-32-NEXT:    retl
    805   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
    806   %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
    807   %ret1 = add i32 %res0, %res1
    808   %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
    809   %ret2 = add i32 %ret1, %res2
    810   %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
    811   %ret3 = add i32 %ret2, %res3
    812   %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
    813   %ret4 = add i32 %ret3, %res4
    814   %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
    815   %ret5 = add i32 %ret4, %res5
    816   %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
    817   %ret6 = add i32 %ret5, %res6
    818   %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
    819   %ret7 = add i32 %ret6, %res7
    820   ret i32 %ret7
    821 }
    822 
    823 declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
    824 
    825 declare <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8>, <64 x i8>, i64) nounwind readonly
    826 
    ; Word blend test: passes %a1, %a2 and the i32 %mask straight to
    ; @llvm.x86.avx512.mask.blend.w.512 and returns the result.
    ; The autogenerated assertions expect the mask in %k1 (moved from %edi on the
    ; x86_64 run, loaded from the stack on the i386 run) and one masked vpblendmw.
    827 define <32 x i16> @test_x86_mask_blend_w_512(i32 %mask, <32 x i16> %a1, <32 x i16> %a2) {
    828 ; AVX512BW-LABEL: test_x86_mask_blend_w_512:
    829 ; AVX512BW:       ## BB#0:
    830 ; AVX512BW-NEXT:    kmovd %edi, %k1
    831 ; AVX512BW-NEXT:    vpblendmw %zmm1, %zmm0, %zmm0 {%k1}
    832 ; AVX512BW-NEXT:    retq
    833 ;
    834 ; AVX512F-32-LABEL: test_x86_mask_blend_w_512:
    835 ; AVX512F-32:       # BB#0:
    836 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    837 ; AVX512F-32-NEXT:    vpblendmw %zmm1, %zmm0, %zmm0 {%k1}
    838 ; AVX512F-32-NEXT:    retl
    839     %res = call <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16> %a1, <32 x i16> %a2, i32 %mask) ; <<32 x i16>> [#uses=1]
    840   ret <32 x i16> %res
    841 }
    842 declare <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16>, <32 x i16>, i32) nounwind readonly
    843 
    ; Byte blend test: passes %a1, %a2 and the i64 mask %a0 straight to
    ; @llvm.x86.avx512.mask.blend.b.512 and returns the result.
    ; The autogenerated assertions expect a single kmovq of the mask on the x86_64
    ; run; on the i386 run the 64-bit mask is expected to be assembled from two
    ; 32-bit stack loads (kmovd) joined with kunpckdq before the masked vpblendmb.
    844 define <64 x i8> @test_x86_mask_blend_b_512(i64 %a0, <64 x i8> %a1, <64 x i8> %a2) {
    845 ; AVX512BW-LABEL: test_x86_mask_blend_b_512:
    846 ; AVX512BW:       ## BB#0:
    847 ; AVX512BW-NEXT:    kmovq %rdi, %k1
    848 ; AVX512BW-NEXT:    vpblendmb %zmm1, %zmm0, %zmm0 {%k1}
    849 ; AVX512BW-NEXT:    retq
    850 ;
    851 ; AVX512F-32-LABEL: test_x86_mask_blend_b_512:
    852 ; AVX512F-32:       # BB#0:
    853 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
    854 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    855 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
    856 ; AVX512F-32-NEXT:    vpblendmb %zmm1, %zmm0, %zmm0 {%k1}
    857 ; AVX512F-32-NEXT:    retl
    858   %res = call <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8> %a1, <64 x i8> %a2, i64 %a0) ; <<64 x i8>> [#uses=1]
    859   ret <64 x i8> %res
    860 }
    861 
    ; packssdw with two vector arguments, a zeroinitializer pass-through and an
    ; all-ones mask (i32 -1). The autogenerated assertions expect a plain,
    ; unmasked vpackssdw on both RUN configurations.
    862 define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
    863 ; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
    864 ; AVX512BW:       ## BB#0:
    865 ; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
    866 ; AVX512BW-NEXT:    retq
    867 ;
    868 ; AVX512F-32-LABEL: test_mask_packs_epi32_rr_512:
    869 ; AVX512F-32:       # BB#0:
    870 ; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
    871 ; AVX512F-32-NEXT:    retl
    872   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
    873   ret <32 x i16> %res
    874 }
    875 
    ; packssdw with two vector arguments, a caller-supplied %passThru vector and a
    ; caller-supplied %mask. The autogenerated assertions expect a {%k1}-masked
    ; vpackssdw that writes %zmm2, followed by a vmovaps copying %zmm2 into the
    ; return register %zmm0.
    876 define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
    877 ; AVX512BW-LABEL: test_mask_packs_epi32_rrk_512:
    878 ; AVX512BW:       ## BB#0:
    879 ; AVX512BW-NEXT:    kmovd %edi, %k1
    880 ; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
    881 ; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
    882 ; AVX512BW-NEXT:    retq
    883 ;
    884 ; AVX512F-32-LABEL: test_mask_packs_epi32_rrk_512:
    885 ; AVX512F-32:       # BB#0:
    886 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    887 ; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
    888 ; AVX512F-32-NEXT:    vmovaps %zmm2, %zmm0
    889 ; AVX512F-32-NEXT:    retl
    890   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
    891   ret <32 x i16> %res
    892 }
    893 
    ; packssdw with two vector arguments, a zeroinitializer pass-through and a
    ; caller-supplied %mask. The autogenerated assertions expect the zero-masking
    ; form of vpackssdw ({%k1} {z}) writing directly into %zmm0.
    894 define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
    895 ; AVX512BW-LABEL: test_mask_packs_epi32_rrkz_512:
    896 ; AVX512BW:       ## BB#0:
    897 ; AVX512BW-NEXT:    kmovd %edi, %k1
    898 ; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
    899 ; AVX512BW-NEXT:    retq
    900 ;
    901 ; AVX512F-32-LABEL: test_mask_packs_epi32_rrkz_512:
    902 ; AVX512F-32:       # BB#0:
    903 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    904 ; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
    905 ; AVX512F-32-NEXT:    retl
    906   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
    907   ret <32 x i16> %res
    908 }
    909 
    ; packssdw whose second operand is a full <16 x i32> vector loaded from %ptr_b,
    ; with a zeroinitializer pass-through and an all-ones mask (i32 -1). The
    ; autogenerated assertions expect the load to appear as the memory operand of
    ; an unmasked vpackssdw rather than as a separate vector load.
    910 define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
    911 ; AVX512BW-LABEL: test_mask_packs_epi32_rm_512:
    912 ; AVX512BW:       ## BB#0:
    913 ; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0
    914 ; AVX512BW-NEXT:    retq
    915 ;
    916 ; AVX512F-32-LABEL: test_mask_packs_epi32_rm_512:
    917 ; AVX512F-32:       # BB#0:
    918 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    919 ; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0
    920 ; AVX512F-32-NEXT:    retl
    921   %b = load <16 x i32>, <16 x i32>* %ptr_b
    922   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
    923   ret <32 x i16> %res
    924 }
    925 
    ; packssdw whose second operand is a full <16 x i32> vector loaded from %ptr_b,
    ; with a caller-supplied %passThru vector and %mask. The autogenerated
    ; assertions expect a {%k1}-masked vpackssdw with a memory operand writing
    ; %zmm1, followed by a vmovaps copying %zmm1 into the return register %zmm0.
    926 define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
    927 ; AVX512BW-LABEL: test_mask_packs_epi32_rmk_512:
    928 ; AVX512BW:       ## BB#0:
    929 ; AVX512BW-NEXT:    kmovd %esi, %k1
    930 ; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm1 {%k1}
    931 ; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
    932 ; AVX512BW-NEXT:    retq
    933 ;
    934 ; AVX512F-32-LABEL: test_mask_packs_epi32_rmk_512:
    935 ; AVX512F-32:       # BB#0:
    936 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    937 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    938 ; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm1 {%k1}
    939 ; AVX512F-32-NEXT:    vmovaps %zmm1, %zmm0
    940 ; AVX512F-32-NEXT:    retl
    941   %b = load <16 x i32>, <16 x i32>* %ptr_b
    942   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
    943   ret <32 x i16> %res
    944 }
    945 
    ; packssdw whose second operand is a full <16 x i32> vector loaded from %ptr_b,
    ; with a zeroinitializer pass-through and a caller-supplied %mask. The
    ; autogenerated assertions expect the zero-masking form ({%k1} {z}) of
    ; vpackssdw with a memory operand.
    946 define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
    947 ; AVX512BW-LABEL: test_mask_packs_epi32_rmkz_512:
    948 ; AVX512BW:       ## BB#0:
    949 ; AVX512BW-NEXT:    kmovd %esi, %k1
    950 ; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z}
    951 ; AVX512BW-NEXT:    retq
    952 ;
    953 ; AVX512F-32-LABEL: test_mask_packs_epi32_rmkz_512:
    954 ; AVX512F-32:       # BB#0:
    955 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    956 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    957 ; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z}
    958 ; AVX512F-32-NEXT:    retl
    959   %b = load <16 x i32>, <16 x i32>* %ptr_b
    960   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
    961   ret <32 x i16> %res
    962 }
    963 
    ; packssdw whose second operand is a scalar i32 loaded from %ptr_b and splatted
    ; to all 16 lanes (insertelement into lane 0, then a shufflevector with an
    ; all-zero index mask). Pass-through is zeroinitializer and the mask is
    ; all-ones (i32 -1). The autogenerated assertions expect an unmasked vpackssdw
    ; with a {1to16} broadcast memory operand.
    964 define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
    965 ; AVX512BW-LABEL: test_mask_packs_epi32_rmb_512:
    966 ; AVX512BW:       ## BB#0:
    967 ; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0
    968 ; AVX512BW-NEXT:    retq
    969 ;
    970 ; AVX512F-32-LABEL: test_mask_packs_epi32_rmb_512:
    971 ; AVX512F-32:       # BB#0:
    972 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    973 ; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0
    974 ; AVX512F-32-NEXT:    retl
    975   %q = load i32, i32* %ptr_b
    976   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
    977   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
    978   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
    979   ret <32 x i16> %res
    980 }
    981 
    ; packssdw whose second operand is a scalar i32 loaded from %ptr_b and splatted
    ; to all 16 lanes, with a caller-supplied %passThru vector and %mask. The
    ; autogenerated assertions expect a {%k1}-masked vpackssdw with a {1to16}
    ; broadcast memory operand writing %zmm1, followed by a vmovaps copying %zmm1
    ; into the return register %zmm0.
    982 define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
    983 ; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512:
    984 ; AVX512BW:       ## BB#0:
    985 ; AVX512BW-NEXT:    kmovd %esi, %k1
    986 ; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
    987 ; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
    988 ; AVX512BW-NEXT:    retq
    989 ;
    990 ; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512:
    991 ; AVX512F-32:       # BB#0:
    992 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    993 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    994 ; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
    995 ; AVX512F-32-NEXT:    vmovaps %zmm1, %zmm0
    996 ; AVX512F-32-NEXT:    retl
    997   %q = load i32, i32* %ptr_b
    998   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
    999   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
   1000   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
   1001   ret <32 x i16> %res
   1002 }
   1003 
   ; packssdw whose second operand is a scalar i32 loaded from %ptr_b and splatted
   ; to all 16 lanes, with a zeroinitializer pass-through and a caller-supplied
   ; %mask. The autogenerated assertions expect the zero-masking form
   ; ({%k1} {z}) of vpackssdw with a {1to16} broadcast memory operand.
   1004 define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
   1005 ; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512:
   1006 ; AVX512BW:       ## BB#0:
   1007 ; AVX512BW-NEXT:    kmovd %esi, %k1
   1008 ; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
   1009 ; AVX512BW-NEXT:    retq
   1010 ;
   1011 ; AVX512F-32-LABEL: test_mask_packs_epi32_rmbkz_512:
   1012 ; AVX512F-32:       # BB#0:
   1013 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1014 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1015 ; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
   1016 ; AVX512F-32-NEXT:    retl
   1017   %q = load i32, i32* %ptr_b
   1018   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   1019   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
   1020   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
   1021   ret <32 x i16> %res
   1022 }
   1023 
   1024 declare <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32)
   1025 
   ; packsswb with two vector arguments, a zeroinitializer pass-through and an
   ; all-ones mask (i64 -1). The autogenerated assertions expect a plain,
   ; unmasked vpacksswb on both RUN configurations.
   1026 define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
   1027 ; AVX512BW-LABEL: test_mask_packs_epi16_rr_512:
   1028 ; AVX512BW:       ## BB#0:
   1029 ; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
   1030 ; AVX512BW-NEXT:    retq
   1031 ;
   1032 ; AVX512F-32-LABEL: test_mask_packs_epi16_rr_512:
   1033 ; AVX512F-32:       # BB#0:
   1034 ; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
   1035 ; AVX512F-32-NEXT:    retl
   1036   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
   1037   ret <64 x i8> %res
   1038 }
   1039 
   ; packsswb with two vector arguments, a caller-supplied %passThru vector and a
   ; caller-supplied i64 %mask. The autogenerated assertions expect a
   ; {%k1}-masked vpacksswb writing %zmm2 followed by a vmovaps into %zmm0. On the
   ; i386 run the 64-bit mask is expected to be assembled from two 32-bit stack
   ; loads (kmovd) joined with kunpckdq.
   1040 define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
   1041 ; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512:
   1042 ; AVX512BW:       ## BB#0:
   1043 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   1044 ; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
   1045 ; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
   1046 ; AVX512BW-NEXT:    retq
   1047 ;
   1048 ; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512:
   1049 ; AVX512F-32:       # BB#0:
   1050 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   1051 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1052 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   1053 ; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
   1054 ; AVX512F-32-NEXT:    vmovaps %zmm2, %zmm0
   1055 ; AVX512F-32-NEXT:    retl
   1056   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
   1057   ret <64 x i8> %res
   1058 }
   1059 
   ; packsswb with two vector arguments, a zeroinitializer pass-through and a
   ; caller-supplied i64 %mask. The autogenerated assertions expect the
   ; zero-masking form ({%k1} {z}) of vpacksswb. On the i386 run the 64-bit mask
   ; is expected to be assembled from two kmovd stack loads joined with kunpckdq.
   1060 define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
   1061 ; AVX512BW-LABEL: test_mask_packs_epi16_rrkz_512:
   1062 ; AVX512BW:       ## BB#0:
   1063 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   1064 ; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
   1065 ; AVX512BW-NEXT:    retq
   1066 ;
   1067 ; AVX512F-32-LABEL: test_mask_packs_epi16_rrkz_512:
   1068 ; AVX512F-32:       # BB#0:
   1069 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   1070 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1071 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   1072 ; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
   1073 ; AVX512F-32-NEXT:    retl
   1074   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
   1075   ret <64 x i8> %res
   1076 }
   1077 
   ; packsswb whose second operand is a full <32 x i16> vector loaded from %ptr_b,
   ; with a zeroinitializer pass-through and an all-ones mask (i64 -1). The
   ; autogenerated assertions expect the load to appear as the memory operand of
   ; an unmasked vpacksswb.
   1078 define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
   1079 ; AVX512BW-LABEL: test_mask_packs_epi16_rm_512:
   1080 ; AVX512BW:       ## BB#0:
   1081 ; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0
   1082 ; AVX512BW-NEXT:    retq
   1083 ;
   1084 ; AVX512F-32-LABEL: test_mask_packs_epi16_rm_512:
   1085 ; AVX512F-32:       # BB#0:
   1086 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1087 ; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0
   1088 ; AVX512F-32-NEXT:    retl
   1089   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1090   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
   1091   ret <64 x i8> %res
   1092 }
   1093 
   ; packsswb whose second operand is a full <32 x i16> vector loaded from %ptr_b,
   ; with a caller-supplied %passThru vector and i64 %mask. The autogenerated
   ; assertions expect a {%k1}-masked vpacksswb with a memory operand writing
   ; %zmm1, followed by a vmovaps into %zmm0. On the i386 run the 64-bit mask is
   ; expected to be assembled from two kmovd stack loads joined with kunpckdq.
   1094 define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
   1095 ; AVX512BW-LABEL: test_mask_packs_epi16_rmk_512:
   1096 ; AVX512BW:       ## BB#0:
   1097 ; AVX512BW-NEXT:    kmovq %rsi, %k1
   1098 ; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm1 {%k1}
   1099 ; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
   1100 ; AVX512BW-NEXT:    retq
   1101 ;
   1102 ; AVX512F-32-LABEL: test_mask_packs_epi16_rmk_512:
   1103 ; AVX512F-32:       # BB#0:
   1104 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1105 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   1106 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1107 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   1108 ; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm1 {%k1}
   1109 ; AVX512F-32-NEXT:    vmovaps %zmm1, %zmm0
   1110 ; AVX512F-32-NEXT:    retl
   1111   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1112   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
   1113   ret <64 x i8> %res
   1114 }
   1115 
   ; packsswb whose second operand is a full <32 x i16> vector loaded from %ptr_b,
   ; with a zeroinitializer pass-through and a caller-supplied i64 %mask. The
   ; autogenerated assertions expect the zero-masking form ({%k1} {z}) of
   ; vpacksswb with a memory operand. On the i386 run the 64-bit mask is expected
   ; to be assembled from two kmovd stack loads joined with kunpckdq.
   1116 define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
   1117 ; AVX512BW-LABEL: test_mask_packs_epi16_rmkz_512:
   1118 ; AVX512BW:       ## BB#0:
   1119 ; AVX512BW-NEXT:    kmovq %rsi, %k1
   1120 ; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z}
   1121 ; AVX512BW-NEXT:    retq
   1122 ;
   1123 ; AVX512F-32-LABEL: test_mask_packs_epi16_rmkz_512:
   1124 ; AVX512F-32:       # BB#0:
   1125 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1126 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   1127 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1128 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   1129 ; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z}
   1130 ; AVX512F-32-NEXT:    retl
   1131   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1132   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
   1133   ret <64 x i8> %res
   1134 }
   1135 
   1136 declare <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16>, <32 x i16>, <64 x i8>, i64)
   1137 
   1138 
   ; packusdw with two vector arguments, a zeroinitializer pass-through and an
   ; all-ones mask (i32 -1). The autogenerated assertions expect a plain,
   ; unmasked vpackusdw on both RUN configurations.
   1139 define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
   1140 ; AVX512BW-LABEL: test_mask_packus_epi32_rr_512:
   1141 ; AVX512BW:       ## BB#0:
   1142 ; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
   1143 ; AVX512BW-NEXT:    retq
   1144 ;
   1145 ; AVX512F-32-LABEL: test_mask_packus_epi32_rr_512:
   1146 ; AVX512F-32:       # BB#0:
   1147 ; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
   1148 ; AVX512F-32-NEXT:    retl
   1149   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
   1150   ret <32 x i16> %res
   1151 }
   1152 
   ; packusdw with two vector arguments, a caller-supplied %passThru vector and a
   ; caller-supplied %mask. The autogenerated assertions expect a {%k1}-masked
   ; vpackusdw that writes %zmm2, followed by a vmovaps copying %zmm2 into the
   ; return register %zmm0.
   1153 define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
   1154 ; AVX512BW-LABEL: test_mask_packus_epi32_rrk_512:
   1155 ; AVX512BW:       ## BB#0:
   1156 ; AVX512BW-NEXT:    kmovd %edi, %k1
   1157 ; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
   1158 ; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
   1159 ; AVX512BW-NEXT:    retq
   1160 ;
   1161 ; AVX512F-32-LABEL: test_mask_packus_epi32_rrk_512:
   1162 ; AVX512F-32:       # BB#0:
   1163 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1164 ; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
   1165 ; AVX512F-32-NEXT:    vmovaps %zmm2, %zmm0
   1166 ; AVX512F-32-NEXT:    retl
   1167   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
   1168   ret <32 x i16> %res
   1169 }
   1170 
   ; packusdw with two vector arguments, a zeroinitializer pass-through and a
   ; caller-supplied %mask. The autogenerated assertions expect the zero-masking
   ; form of vpackusdw ({%k1} {z}) writing directly into %zmm0.
   1171 define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
   1172 ; AVX512BW-LABEL: test_mask_packus_epi32_rrkz_512:
   1173 ; AVX512BW:       ## BB#0:
   1174 ; AVX512BW-NEXT:    kmovd %edi, %k1
   1175 ; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
   1176 ; AVX512BW-NEXT:    retq
   1177 ;
   1178 ; AVX512F-32-LABEL: test_mask_packus_epi32_rrkz_512:
   1179 ; AVX512F-32:       # BB#0:
   1180 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1181 ; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
   1182 ; AVX512F-32-NEXT:    retl
   1183   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
   1184   ret <32 x i16> %res
   1185 }
   1186 
   ; packusdw whose second operand is a full <16 x i32> vector loaded from %ptr_b,
   ; with a zeroinitializer pass-through and an all-ones mask (i32 -1). The
   ; autogenerated assertions expect the load to appear as the memory operand of
   ; an unmasked vpackusdw.
   1187 define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
   1188 ; AVX512BW-LABEL: test_mask_packus_epi32_rm_512:
   1189 ; AVX512BW:       ## BB#0:
   1190 ; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0
   1191 ; AVX512BW-NEXT:    retq
   1192 ;
   1193 ; AVX512F-32-LABEL: test_mask_packus_epi32_rm_512:
   1194 ; AVX512F-32:       # BB#0:
   1195 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1196 ; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0
   1197 ; AVX512F-32-NEXT:    retl
   1198   %b = load <16 x i32>, <16 x i32>* %ptr_b
   1199   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
   1200   ret <32 x i16> %res
   1201 }
   1202 
   ; packusdw whose second operand is a full <16 x i32> vector loaded from %ptr_b,
   ; with a caller-supplied %passThru vector and %mask. The autogenerated
   ; assertions expect a {%k1}-masked vpackusdw with a memory operand writing
   ; %zmm1, followed by a vmovaps copying %zmm1 into the return register %zmm0.
   1203 define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
   1204 ; AVX512BW-LABEL: test_mask_packus_epi32_rmk_512:
   1205 ; AVX512BW:       ## BB#0:
   1206 ; AVX512BW-NEXT:    kmovd %esi, %k1
   1207 ; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm1 {%k1}
   1208 ; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
   1209 ; AVX512BW-NEXT:    retq
   1210 ;
   1211 ; AVX512F-32-LABEL: test_mask_packus_epi32_rmk_512:
   1212 ; AVX512F-32:       # BB#0:
   1213 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1214 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1215 ; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm1 {%k1}
   1216 ; AVX512F-32-NEXT:    vmovaps %zmm1, %zmm0
   1217 ; AVX512F-32-NEXT:    retl
   1218   %b = load <16 x i32>, <16 x i32>* %ptr_b
   1219   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
   1220   ret <32 x i16> %res
   1221 }
   1222 
   ; packusdw whose second operand is a full <16 x i32> vector loaded from %ptr_b,
   ; with a zeroinitializer pass-through and a caller-supplied %mask. The
   ; autogenerated assertions expect the zero-masking form ({%k1} {z}) of
   ; vpackusdw with a memory operand.
   1223 define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
   1224 ; AVX512BW-LABEL: test_mask_packus_epi32_rmkz_512:
   1225 ; AVX512BW:       ## BB#0:
   1226 ; AVX512BW-NEXT:    kmovd %esi, %k1
   1227 ; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z}
   1228 ; AVX512BW-NEXT:    retq
   1229 ;
   1230 ; AVX512F-32-LABEL: test_mask_packus_epi32_rmkz_512:
   1231 ; AVX512F-32:       # BB#0:
   1232 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1233 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1234 ; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z}
   1235 ; AVX512F-32-NEXT:    retl
   1236   %b = load <16 x i32>, <16 x i32>* %ptr_b
   1237   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
   1238   ret <32 x i16> %res
   1239 }
   1240 
   ; packusdw whose second operand is a scalar i32 loaded from %ptr_b and splatted
   ; to all 16 lanes (insertelement into lane 0, then a shufflevector with an
   ; all-zero index mask). Pass-through is zeroinitializer and the mask is
   ; all-ones (i32 -1). The autogenerated assertions expect an unmasked vpackusdw
   ; with a {1to16} broadcast memory operand.
   1241 define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
   1242 ; AVX512BW-LABEL: test_mask_packus_epi32_rmb_512:
   1243 ; AVX512BW:       ## BB#0:
   1244 ; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0
   1245 ; AVX512BW-NEXT:    retq
   1246 ;
   1247 ; AVX512F-32-LABEL: test_mask_packus_epi32_rmb_512:
   1248 ; AVX512F-32:       # BB#0:
   1249 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1250 ; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0
   1251 ; AVX512F-32-NEXT:    retl
   1252   %q = load i32, i32* %ptr_b
   1253   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   1254   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
   1255   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
   1256   ret <32 x i16> %res
   1257 }
   1258 
   1259 define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
        ; Same scalar-load splat for %b as the rmb variant, but with a %passThru operand and
        ; a variable %mask. The checks expect vpackusdw with a {1to16} memory operand writing
        ; %zmm1 under {%k1}, followed by a vmovaps of %zmm1 into %zmm0.
   1260 ; AVX512BW-LABEL: test_mask_packus_epi32_rmbk_512:
   1261 ; AVX512BW:       ## BB#0:
   1262 ; AVX512BW-NEXT:    kmovd %esi, %k1
   1263 ; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
   1264 ; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
   1265 ; AVX512BW-NEXT:    retq
   1266 ;
   1267 ; AVX512F-32-LABEL: test_mask_packus_epi32_rmbk_512:
   1268 ; AVX512F-32:       # BB#0:
   1269 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1270 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1271 ; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
   1272 ; AVX512F-32-NEXT:    vmovaps %zmm1, %zmm0
   1273 ; AVX512F-32-NEXT:    retl
   1274   %q = load i32, i32* %ptr_b
   1275   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   1276   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
   1277   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
   1278   ret <32 x i16> %res
   1279 }
   1280 
   1281 define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
        ; Scalar i32 load splatted into %b (insertelement + zero-mask shufflevector), with a
        ; zeroinitializer pass-through and a variable %mask. The checks expect vpackusdw with
        ; a {1to16} memory operand operating in place on %zmm0 under {%k1} {z}.
   1282 ; AVX512BW-LABEL: test_mask_packus_epi32_rmbkz_512:
   1283 ; AVX512BW:       ## BB#0:
   1284 ; AVX512BW-NEXT:    kmovd %esi, %k1
   1285 ; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
   1286 ; AVX512BW-NEXT:    retq
   1287 ;
   1288 ; AVX512F-32-LABEL: test_mask_packus_epi32_rmbkz_512:
   1289 ; AVX512F-32:       # BB#0:
   1290 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1291 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1292 ; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
   1293 ; AVX512F-32-NEXT:    retl
   1294   %q = load i32, i32* %ptr_b
   1295   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   1296   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
   1297   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
   1298   ret <32 x i16> %res
   1299 }
   1300 
   1301 declare <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32)
   1302 
   1303 define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
        ; Register-register call of mask.packuswb.512 with a zeroinitializer pass-through and
        ; an all-ones (-1) i64 mask; both runs are checked for one vpackuswb with no mask
        ; register.
   1304 ; AVX512BW-LABEL: test_mask_packus_epi16_rr_512:
   1305 ; AVX512BW:       ## BB#0:
   1306 ; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
   1307 ; AVX512BW-NEXT:    retq
   1308 ;
   1309 ; AVX512F-32-LABEL: test_mask_packus_epi16_rr_512:
   1310 ; AVX512F-32:       # BB#0:
   1311 ; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
   1312 ; AVX512F-32-NEXT:    retl
   1313   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
   1314   ret <64 x i8> %res
   1315 }
   1316 
   1317 define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
        ; Register operands with a %passThru vector and a variable i64 %mask. The checks
        ; expect vpackuswb writing %zmm2 under {%k1}, then a vmovaps of %zmm2 into %zmm0.
        ; On the x86_64 run the mask arrives via kmovq from %rdi; on the i386 run it is read
        ; from the stack with two kmovd and joined by kunpckdq.
   1318 ; AVX512BW-LABEL: test_mask_packus_epi16_rrk_512:
   1319 ; AVX512BW:       ## BB#0:
   1320 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   1321 ; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
   1322 ; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
   1323 ; AVX512BW-NEXT:    retq
   1324 ;
   1325 ; AVX512F-32-LABEL: test_mask_packus_epi16_rrk_512:
   1326 ; AVX512F-32:       # BB#0:
   1327 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   1328 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1329 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   1330 ; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
   1331 ; AVX512F-32-NEXT:    vmovaps %zmm2, %zmm0
   1332 ; AVX512F-32-NEXT:    retl
   1333   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
   1334   ret <64 x i8> %res
   1335 }
   1336 
   1337 define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
        ; Register operands with a zeroinitializer pass-through and a variable i64 %mask.
        ; The checks expect vpackuswb in place on %zmm0 under {%k1} {z}. On the i386 run the
        ; mask is read from the stack with two kmovd and joined by kunpckdq.
   1338 ; AVX512BW-LABEL: test_mask_packus_epi16_rrkz_512:
   1339 ; AVX512BW:       ## BB#0:
   1340 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   1341 ; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
   1342 ; AVX512BW-NEXT:    retq
   1343 ;
   1344 ; AVX512F-32-LABEL: test_mask_packus_epi16_rrkz_512:
   1345 ; AVX512F-32:       # BB#0:
   1346 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   1347 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1348 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   1349 ; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
   1350 ; AVX512F-32-NEXT:    retl
   1351   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
   1352   ret <64 x i8> %res
   1353 }
   1354 
   1355 define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
        ; %b is loaded from %ptr_b and the mask is -1; the checks expect the load folded into
        ; the memory operand of a single vpackuswb with no mask register.
   1356 ; AVX512BW-LABEL: test_mask_packus_epi16_rm_512:
   1357 ; AVX512BW:       ## BB#0:
   1358 ; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0
   1359 ; AVX512BW-NEXT:    retq
   1360 ;
   1361 ; AVX512F-32-LABEL: test_mask_packus_epi16_rm_512:
   1362 ; AVX512F-32:       # BB#0:
   1363 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1364 ; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0
   1365 ; AVX512F-32-NEXT:    retl
   1366   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1367   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
   1368   ret <64 x i8> %res
   1369 }
   1370 
   1371 define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
        ; %b is loaded from %ptr_b, with a %passThru vector and a variable i64 %mask. The
        ; checks expect vpackuswb with a memory source writing %zmm1 under {%k1}, then a
        ; vmovaps of %zmm1 into %zmm0. On the i386 run the mask is read from the stack with
        ; two kmovd and joined by kunpckdq.
   1372 ; AVX512BW-LABEL: test_mask_packus_epi16_rmk_512:
   1373 ; AVX512BW:       ## BB#0:
   1374 ; AVX512BW-NEXT:    kmovq %rsi, %k1
   1375 ; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm1 {%k1}
   1376 ; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
   1377 ; AVX512BW-NEXT:    retq
   1378 ;
   1379 ; AVX512F-32-LABEL: test_mask_packus_epi16_rmk_512:
   1380 ; AVX512F-32:       # BB#0:
   1381 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1382 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   1383 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1384 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   1385 ; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm1 {%k1}
   1386 ; AVX512F-32-NEXT:    vmovaps %zmm1, %zmm0
   1387 ; AVX512F-32-NEXT:    retl
   1388   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1389   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
   1390   ret <64 x i8> %res
   1391 }
   1392 
   1393 define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
        ; %b is loaded from %ptr_b, with a zeroinitializer pass-through and a variable i64
        ; %mask. The checks expect vpackuswb with a memory source in place on %zmm0 under
        ; {%k1} {z}. On the i386 run the mask is read from the stack with two kmovd and
        ; joined by kunpckdq.
   1394 ; AVX512BW-LABEL: test_mask_packus_epi16_rmkz_512:
   1395 ; AVX512BW:       ## BB#0:
   1396 ; AVX512BW-NEXT:    kmovq %rsi, %k1
   1397 ; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z}
   1398 ; AVX512BW-NEXT:    retq
   1399 ;
   1400 ; AVX512F-32-LABEL: test_mask_packus_epi16_rmkz_512:
   1401 ; AVX512F-32:       # BB#0:
   1402 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1403 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   1404 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1405 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   1406 ; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z}
   1407 ; AVX512F-32-NEXT:    retl
   1408   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1409   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
   1410   ret <64 x i8> %res
   1411 }
   1412 
   1413 declare <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16>, <32 x i16>, <64 x i8>, i64)
   1414 
   1415 define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
        ; Register-register call of mask.padds.w.512 with a zeroinitializer pass-through and
        ; an all-ones (-1) mask; both runs are checked for one vpaddsw with no mask register.
   1416 ; AVX512BW-LABEL: test_mask_adds_epi16_rr_512:
   1417 ; AVX512BW:       ## BB#0:
   1418 ; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
   1419 ; AVX512BW-NEXT:    retq
   1420 ;
   1421 ; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512:
   1422 ; AVX512F-32:       # BB#0:
   1423 ; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
   1424 ; AVX512F-32-NEXT:    retl
   1425   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1426   ret <32 x i16> %res
   1427 }
   1428 
   1429 define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
        ; Register operands with a %passThru vector and a variable %mask. The checks expect
        ; the mask moved into %k1 (from %edi, or from the stack on the i386 run), vpaddsw
        ; writing %zmm2 under {%k1}, and a vmovaps of %zmm2 into %zmm0.
   1430 ; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512:
   1431 ; AVX512BW:       ## BB#0:
   1432 ; AVX512BW-NEXT:    kmovd %edi, %k1
   1433 ; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
   1434 ; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
   1435 ; AVX512BW-NEXT:    retq
   1436 ;
   1437 ; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512:
   1438 ; AVX512F-32:       # BB#0:
   1439 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1440 ; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
   1441 ; AVX512F-32-NEXT:    vmovaps %zmm2, %zmm0
   1442 ; AVX512F-32-NEXT:    retl
   1443   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1444   ret <32 x i16> %res
   1445 }
   1446 
   1447 define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
        ; Register operands with a zeroinitializer pass-through and a variable %mask. The
        ; checks expect the mask in %k1 and vpaddsw in place on %zmm0 under {%k1} {z}.
   1448 ; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512:
   1449 ; AVX512BW:       ## BB#0:
   1450 ; AVX512BW-NEXT:    kmovd %edi, %k1
   1451 ; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
   1452 ; AVX512BW-NEXT:    retq
   1453 ;
   1454 ; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512:
   1455 ; AVX512F-32:       # BB#0:
   1456 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1457 ; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
   1458 ; AVX512F-32-NEXT:    retl
   1459   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   1460   ret <32 x i16> %res
   1461 }
   1462 
   1463 define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
        ; %b is loaded from %ptr_b and the mask is -1; the checks expect the load folded into
        ; the memory operand of a single vpaddsw with no mask register.
   1464 ; AVX512BW-LABEL: test_mask_adds_epi16_rm_512:
   1465 ; AVX512BW:       ## BB#0:
   1466 ; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0
   1467 ; AVX512BW-NEXT:    retq
   1468 ;
   1469 ; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512:
   1470 ; AVX512F-32:       # BB#0:
   1471 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1472 ; AVX512F-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm0
   1473 ; AVX512F-32-NEXT:    retl
   1474   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1475   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1476   ret <32 x i16> %res
   1477 }
   1478 
   1479 define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
        ; %b is loaded from %ptr_b, with a %passThru vector and a variable %mask. The checks
        ; expect vpaddsw with a memory source writing %zmm1 under {%k1}, then a vmovaps of
        ; %zmm1 into %zmm0.
   1480 ; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512:
   1481 ; AVX512BW:       ## BB#0:
   1482 ; AVX512BW-NEXT:    kmovd %esi, %k1
   1483 ; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm1 {%k1}
   1484 ; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
   1485 ; AVX512BW-NEXT:    retq
   1486 ;
   1487 ; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512:
   1488 ; AVX512F-32:       # BB#0:
   1489 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1490 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1491 ; AVX512F-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm1 {%k1}
   1492 ; AVX512F-32-NEXT:    vmovaps %zmm1, %zmm0
   1493 ; AVX512F-32-NEXT:    retl
   1494   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1495   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1496   ret <32 x i16> %res
   1497 }
   1498 
   1499 define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
        ; %b is loaded from %ptr_b, with a zeroinitializer pass-through and a variable %mask.
        ; The checks expect vpaddsw with a memory source in place on %zmm0 under {%k1} {z}.
   1500 ; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512:
   1501 ; AVX512BW:       ## BB#0:
   1502 ; AVX512BW-NEXT:    kmovd %esi, %k1
   1503 ; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z}
   1504 ; AVX512BW-NEXT:    retq
   1505 ;
   1506 ; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512:
   1507 ; AVX512F-32:       # BB#0:
   1508 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1509 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1510 ; AVX512F-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z}
   1511 ; AVX512F-32-NEXT:    retl
   1512   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1513   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   1514   ret <32 x i16> %res
   1515 }
   1516 
   1517 declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   1518 
   1519 define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
        ; Register-register call of mask.psubs.w.512 with a zeroinitializer pass-through and
        ; an all-ones (-1) mask; both runs are checked for one vpsubsw with no mask register.
   1520 ; AVX512BW-LABEL: test_mask_subs_epi16_rr_512:
   1521 ; AVX512BW:       ## BB#0:
   1522 ; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0
   1523 ; AVX512BW-NEXT:    retq
   1524 ;
   1525 ; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512:
   1526 ; AVX512F-32:       # BB#0:
   1527 ; AVX512F-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0
   1528 ; AVX512F-32-NEXT:    retl
   1529   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1530   ret <32 x i16> %res
   1531 }
   1532 
   1533 define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
        ; Register operands with a %passThru vector and a variable %mask. The checks expect
        ; the mask moved into %k1 (from %edi, or from the stack on the i386 run), vpsubsw
        ; writing %zmm2 under {%k1}, and a vmovaps of %zmm2 into %zmm0.
   1534 ; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512:
   1535 ; AVX512BW:       ## BB#0:
   1536 ; AVX512BW-NEXT:    kmovd %edi, %k1
   1537 ; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
   1538 ; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
   1539 ; AVX512BW-NEXT:    retq
   1540 ;
   1541 ; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512:
   1542 ; AVX512F-32:       # BB#0:
   1543 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1544 ; AVX512F-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
   1545 ; AVX512F-32-NEXT:    vmovaps %zmm2, %zmm0
   1546 ; AVX512F-32-NEXT:    retl
   1547   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1548   ret <32 x i16> %res
   1549 }
   1550 
   1551 define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
        ; Register operands with a zeroinitializer pass-through and a variable %mask. The
        ; checks expect the mask in %k1 and vpsubsw in place on %zmm0 under {%k1} {z}.
   1552 ; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512:
   1553 ; AVX512BW:       ## BB#0:
   1554 ; AVX512BW-NEXT:    kmovd %edi, %k1
   1555 ; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
   1556 ; AVX512BW-NEXT:    retq
   1557 ;
   1558 ; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512:
   1559 ; AVX512F-32:       # BB#0:
   1560 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1561 ; AVX512F-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
   1562 ; AVX512F-32-NEXT:    retl
   1563   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   1564   ret <32 x i16> %res
   1565 }
   1566 
   1567 define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
        ; %b is loaded from %ptr_b and the mask is -1; the checks expect the load folded into
        ; the memory operand of a single vpsubsw with no mask register.
   1568 ; AVX512BW-LABEL: test_mask_subs_epi16_rm_512:
   1569 ; AVX512BW:       ## BB#0:
   1570 ; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0
   1571 ; AVX512BW-NEXT:    retq
   1572 ;
   1573 ; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512:
   1574 ; AVX512F-32:       # BB#0:
   1575 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1576 ; AVX512F-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm0
   1577 ; AVX512F-32-NEXT:    retl
   1578   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1579   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1580   ret <32 x i16> %res
   1581 }
   1582 
   1583 define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
        ; %b is loaded from %ptr_b, with a %passThru vector and a variable %mask. The checks
        ; expect vpsubsw with a memory source writing %zmm1 under {%k1}, then a vmovaps of
        ; %zmm1 into %zmm0.
   1584 ; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512:
   1585 ; AVX512BW:       ## BB#0:
   1586 ; AVX512BW-NEXT:    kmovd %esi, %k1
   1587 ; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm1 {%k1}
   1588 ; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
   1589 ; AVX512BW-NEXT:    retq
   1590 ;
   1591 ; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512:
   1592 ; AVX512F-32:       # BB#0:
   1593 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1594 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1595 ; AVX512F-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm1 {%k1}
   1596 ; AVX512F-32-NEXT:    vmovaps %zmm1, %zmm0
   1597 ; AVX512F-32-NEXT:    retl
   1598   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1599   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1600   ret <32 x i16> %res
   1601 }
   1602 
   1603 define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
        ; %b is loaded from %ptr_b, with a zeroinitializer pass-through and a variable %mask.
        ; The checks expect vpsubsw with a memory source in place on %zmm0 under {%k1} {z}.
   1604 ; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512:
   1605 ; AVX512BW:       ## BB#0:
   1606 ; AVX512BW-NEXT:    kmovd %esi, %k1
   1607 ; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z}
   1608 ; AVX512BW-NEXT:    retq
   1609 ;
   1610 ; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512:
   1611 ; AVX512F-32:       # BB#0:
   1612 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1613 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1614 ; AVX512F-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z}
   1615 ; AVX512F-32-NEXT:    retl
   1616   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1617   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   1618   ret <32 x i16> %res
   1619 }
   1620 
   1621 declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   1622 
   1623 define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
        ; Register-register call of mask.paddus.w.512 with a zeroinitializer pass-through and
        ; an all-ones (-1) mask; both runs are checked for one vpaddusw with no mask register.
   1624 ; AVX512BW-LABEL: test_mask_adds_epu16_rr_512:
   1625 ; AVX512BW:       ## BB#0:
   1626 ; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
   1627 ; AVX512BW-NEXT:    retq
   1628 ;
   1629 ; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512:
   1630 ; AVX512F-32:       # BB#0:
   1631 ; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
   1632 ; AVX512F-32-NEXT:    retl
   1633   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1634   ret <32 x i16> %res
   1635 }
   1636 
   1637 define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
        ; Register operands with a %passThru vector and a variable %mask. The checks expect
        ; the mask moved into %k1 (from %edi, or from the stack on the i386 run), vpaddusw
        ; writing %zmm2 under {%k1}, and a vmovaps of %zmm2 into %zmm0.
   1638 ; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512:
   1639 ; AVX512BW:       ## BB#0:
   1640 ; AVX512BW-NEXT:    kmovd %edi, %k1
   1641 ; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
   1642 ; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
   1643 ; AVX512BW-NEXT:    retq
   1644 ;
   1645 ; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512:
   1646 ; AVX512F-32:       # BB#0:
   1647 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1648 ; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
   1649 ; AVX512F-32-NEXT:    vmovaps %zmm2, %zmm0
   1650 ; AVX512F-32-NEXT:    retl
   1651   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1652   ret <32 x i16> %res
   1653 }
   1654 
   1655 define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
        ; Register operands with a zeroinitializer pass-through and a variable %mask. The
        ; checks expect the mask in %k1 and vpaddusw in place on %zmm0 under {%k1} {z}.
   1656 ; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512:
   1657 ; AVX512BW:       ## BB#0:
   1658 ; AVX512BW-NEXT:    kmovd %edi, %k1
   1659 ; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
   1660 ; AVX512BW-NEXT:    retq
   1661 ;
   1662 ; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512:
   1663 ; AVX512F-32:       # BB#0:
   1664 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1665 ; AVX512F-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
   1666 ; AVX512F-32-NEXT:    retl
   1667   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   1668   ret <32 x i16> %res
   1669 }
   1670 
   1671 define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
        ; %b is loaded from %ptr_b and the mask is -1; the checks expect the load folded into
        ; the memory operand of a single vpaddusw with no mask register.
   1672 ; AVX512BW-LABEL: test_mask_adds_epu16_rm_512:
   1673 ; AVX512BW:       ## BB#0:
   1674 ; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0
   1675 ; AVX512BW-NEXT:    retq
   1676 ;
   1677 ; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512:
   1678 ; AVX512F-32:       # BB#0:
   1679 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1680 ; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm0
   1681 ; AVX512F-32-NEXT:    retl
   1682   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1683   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1684   ret <32 x i16> %res
   1685 }
   1686 
   1687 define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
        ; %b is loaded from %ptr_b, with a %passThru vector and a variable %mask. The checks
        ; expect vpaddusw with a memory source writing %zmm1 under {%k1}, then a vmovaps of
        ; %zmm1 into %zmm0.
   1688 ; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512:
   1689 ; AVX512BW:       ## BB#0:
   1690 ; AVX512BW-NEXT:    kmovd %esi, %k1
   1691 ; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm1 {%k1}
   1692 ; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
   1693 ; AVX512BW-NEXT:    retq
   1694 ;
   1695 ; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512:
   1696 ; AVX512F-32:       # BB#0:
   1697 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1698 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1699 ; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm1 {%k1}
   1700 ; AVX512F-32-NEXT:    vmovaps %zmm1, %zmm0
   1701 ; AVX512F-32-NEXT:    retl
   1702   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1703   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1704   ret <32 x i16> %res
   1705 }
   1706 
   1707 define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
        ; %b is loaded from %ptr_b, with a zeroinitializer pass-through and a variable %mask.
        ; The checks expect vpaddusw with a memory source in place on %zmm0 under {%k1} {z}.
   1708 ; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512:
   1709 ; AVX512BW:       ## BB#0:
   1710 ; AVX512BW-NEXT:    kmovd %esi, %k1
   1711 ; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z}
   1712 ; AVX512BW-NEXT:    retq
   1713 ;
   1714 ; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512:
   1715 ; AVX512F-32:       # BB#0:
   1716 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1717 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1718 ; AVX512F-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z}
   1719 ; AVX512F-32-NEXT:    retl
   1720   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1721   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   1722   ret <32 x i16> %res
   1723 }
   1724 
   1725 declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   1726 
   1727 define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
        ; Register-register call of mask.psubus.w.512 with a zeroinitializer pass-through and
        ; an all-ones (-1) mask; both runs are checked for one vpsubusw with no mask register.
   1728 ; AVX512BW-LABEL: test_mask_subs_epu16_rr_512:
   1729 ; AVX512BW:       ## BB#0:
   1730 ; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
   1731 ; AVX512BW-NEXT:    retq
   1732 ;
   1733 ; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512:
   1734 ; AVX512F-32:       # BB#0:
   1735 ; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
   1736 ; AVX512F-32-NEXT:    retl
   1737   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1738   ret <32 x i16> %res
   1739 }
   1740 
   1741 define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
        ; Register operands with a %passThru vector and a variable %mask. The checks expect
        ; the mask moved into %k1 (from %edi, or from the stack on the i386 run), vpsubusw
        ; writing %zmm2 under {%k1}, and a vmovaps of %zmm2 into %zmm0.
   1742 ; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512:
   1743 ; AVX512BW:       ## BB#0:
   1744 ; AVX512BW-NEXT:    kmovd %edi, %k1
   1745 ; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
   1746 ; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
   1747 ; AVX512BW-NEXT:    retq
   1748 ;
   1749 ; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512:
   1750 ; AVX512F-32:       # BB#0:
   1751 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1752 ; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
   1753 ; AVX512F-32-NEXT:    vmovaps %zmm2, %zmm0
   1754 ; AVX512F-32-NEXT:    retl
   1755   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1756   ret <32 x i16> %res
   1757 }
   1758 
   1759 define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
        ; Register operands with a zeroinitializer pass-through and a variable %mask. The
        ; checks expect the mask in %k1 and vpsubusw in place on %zmm0 under {%k1} {z}.
   1760 ; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512:
   1761 ; AVX512BW:       ## BB#0:
   1762 ; AVX512BW-NEXT:    kmovd %edi, %k1
   1763 ; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
   1764 ; AVX512BW-NEXT:    retq
   1765 ;
   1766 ; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512:
   1767 ; AVX512F-32:       # BB#0:
   1768 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1769 ; AVX512F-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
   1770 ; AVX512F-32-NEXT:    retl
   1771   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   1772   ret <32 x i16> %res
   1773 }
   1774 
   1775 define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
        ; %b is loaded from %ptr_b and the mask is -1; the checks expect the load folded into
        ; the memory operand of a single vpsubusw with no mask register.
   1776 ; AVX512BW-LABEL: test_mask_subs_epu16_rm_512:
   1777 ; AVX512BW:       ## BB#0:
   1778 ; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0
   1779 ; AVX512BW-NEXT:    retq
   1780 ;
   1781 ; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512:
   1782 ; AVX512F-32:       # BB#0:
   1783 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1784 ; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm0
   1785 ; AVX512F-32-NEXT:    retl
   1786   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1787   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1788   ret <32 x i16> %res
   1789 }
   1790 
   1791 define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
        ; %b is loaded from %ptr_b, with a %passThru vector and a variable %mask. The checks
        ; expect vpsubusw with a memory source writing %zmm1 under {%k1}, then a vmovaps of
        ; %zmm1 into %zmm0.
   1792 ; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512:
   1793 ; AVX512BW:       ## BB#0:
   1794 ; AVX512BW-NEXT:    kmovd %esi, %k1
   1795 ; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm1 {%k1}
   1796 ; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
   1797 ; AVX512BW-NEXT:    retq
   1798 ;
   1799 ; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512:
   1800 ; AVX512F-32:       # BB#0:
   1801 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1802 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1803 ; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm1 {%k1}
   1804 ; AVX512F-32-NEXT:    vmovaps %zmm1, %zmm0
   1805 ; AVX512F-32-NEXT:    retl
   1806   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1807   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1808   ret <32 x i16> %res
   1809 }
   1810 
   1811 define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
        ; %b is loaded from %ptr_b, with a zeroinitializer pass-through and a variable %mask.
        ; The checks expect vpsubusw with a memory source in place on %zmm0 under {%k1} {z}.
   1812 ; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512:
   1813 ; AVX512BW:       ## BB#0:
   1814 ; AVX512BW-NEXT:    kmovd %esi, %k1
   1815 ; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z}
   1816 ; AVX512BW-NEXT:    retq
   1817 ;
   1818 ; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512:
   1819 ; AVX512F-32:       # BB#0:
   1820 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1821 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1822 ; AVX512F-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z}
   1823 ; AVX512F-32-NEXT:    retl
   1824   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1825   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   1826   ret <32 x i16> %res
   1827 }
   1828 
   1829 declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   1830 
   1831 declare <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
   1832 
   1833 define <64 x i8>@test_int_x86_avx512_mask_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
        ; Calls mask.pmaxs.b.512 twice on the same operands and pass-through %x2, once with
        ; the variable mask %x3 and once with -1, then adds the two results. The checks
        ; expect a vpmaxsb under {%k1} into %zmm2, an unmasked vpmaxsb into %zmm0, and a
        ; vpaddb combining them. On the i386 run the i64 mask is read from the stack with
        ; two kmovd and joined by kunpckdq.
   1834 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxs_b_512:
   1835 ; AVX512BW:       ## BB#0:
   1836 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   1837 ; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm2 {%k1}
   1838 ; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
   1839 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   1840 ; AVX512BW-NEXT:    retq
   1841 ;
   1842 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxs_b_512:
   1843 ; AVX512F-32:       # BB#0:
   1844 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   1845 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1846 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   1847 ; AVX512F-32-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm2 {%k1}
   1848 ; AVX512F-32-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
   1849 ; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   1850 ; AVX512F-32-NEXT:    retl
   1851   %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   1852   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   1853   %res2 = add <64 x i8> %res, %res1
   1854   ret <64 x i8> %res2
   1855 }
   1856 
   1857 declare <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   1858 
   1859 define <32 x i16>@test_int_x86_avx512_mask_pmaxs_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
        ; Calls mask.pmaxs.w.512 twice on the same operands and pass-through %x2, once with
        ; the variable mask %x3 and once with -1, then adds the two results. The checks
        ; expect a vpmaxsw under {%k1} into %zmm2, an unmasked vpmaxsw into %zmm0, and a
        ; vpaddw combining them.
   1860 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxs_w_512:
   1861 ; AVX512BW:       ## BB#0:
   1862 ; AVX512BW-NEXT:    kmovd %edi, %k1
   1863 ; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm2 {%k1}
   1864 ; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
   1865 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   1866 ; AVX512BW-NEXT:    retq
   1867 ;
   1868 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxs_w_512:
   1869 ; AVX512F-32:       # BB#0:
   1870 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1871 ; AVX512F-32-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm2 {%k1}
   1872 ; AVX512F-32-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
   1873 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   1874 ; AVX512F-32-NEXT:    retl
   1875   %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   1876   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   1877   %res2 = add <32 x i16> %res, %res1
   1878   ret <32 x i16> %res2
   1879 }
   1880 
   1881 declare <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
   1882 
   1883 define <64 x i8>@test_int_x86_avx512_mask_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
        ; Calls mask.pmaxu.b.512 twice on the same operands and pass-through %x2, once with
        ; the variable mask %x3 and once with -1, then adds the two results. The checks
        ; expect a vpmaxub under {%k1} into %zmm2, an unmasked vpmaxub into %zmm0, and a
        ; vpaddb combining them. On the i386 run the i64 mask is read from the stack with
        ; two kmovd and joined by kunpckdq.
   1884 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxu_b_512:
   1885 ; AVX512BW:       ## BB#0:
   1886 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   1887 ; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm2 {%k1}
   1888 ; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
   1889 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   1890 ; AVX512BW-NEXT:    retq
   1891 ;
   1892 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxu_b_512:
   1893 ; AVX512F-32:       # BB#0:
   1894 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   1895 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1896 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   1897 ; AVX512F-32-NEXT:    vpmaxub %zmm1, %zmm0, %zmm2 {%k1}
   1898 ; AVX512F-32-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
   1899 ; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   1900 ; AVX512F-32-NEXT:    retl
   1901   %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   1902   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   1903   %res2 = add <64 x i8> %res, %res1
   1904   ret <64 x i8> %res2
   1905 }
   1906 
   1907 declare <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   1908 
        ; Calls the pmaxu.w.512 mask intrinsic with the caller's i32 mask %x3 and
        ; again with an all-ones mask (-1), then adds the two results. Both runs
        ; expect a kmovd into %k1, a vpmaxuw writing %zmm2 under {%k1}, an unmasked
        ; vpmaxuw, and a final vpaddw.
   1909 define <32 x i16>@test_int_x86_avx512_mask_pmaxu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
   1910 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxu_w_512:
   1911 ; AVX512BW:       ## BB#0:
   1912 ; AVX512BW-NEXT:    kmovd %edi, %k1
   1913 ; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm2 {%k1}
   1914 ; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
   1915 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   1916 ; AVX512BW-NEXT:    retq
   1917 ;
   1918 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxu_w_512:
   1919 ; AVX512F-32:       # BB#0:
   1920 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1921 ; AVX512F-32-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm2 {%k1}
   1922 ; AVX512F-32-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
   1923 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   1924 ; AVX512F-32-NEXT:    retl
   1925   %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   1926   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   1927   %res2 = add <32 x i16> %res, %res1
   1928   ret <32 x i16> %res2
   1929 }
   1930 
   1931 declare <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
   1932 
        ; Calls the pmins.b.512 mask intrinsic with the caller's i64 mask %x3 and
        ; again with an all-ones mask (-1), then adds the two results. The expected
        ; code is a masked vpminsb into %zmm2, an unmasked vpminsb, and a vpaddb.
        ; The x86_64 run uses one kmovq from %rdi for the mask; the i386 run builds
        ; it from two stack loads combined by kunpckdq.
   1933 define <64 x i8>@test_int_x86_avx512_mask_pmins_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
   1934 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmins_b_512:
   1935 ; AVX512BW:       ## BB#0:
   1936 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   1937 ; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2 {%k1}
   1938 ; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
   1939 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   1940 ; AVX512BW-NEXT:    retq
   1941 ;
   1942 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmins_b_512:
   1943 ; AVX512F-32:       # BB#0:
   1944 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   1945 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1946 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   1947 ; AVX512F-32-NEXT:    vpminsb %zmm1, %zmm0, %zmm2 {%k1}
   1948 ; AVX512F-32-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
   1949 ; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   1950 ; AVX512F-32-NEXT:    retl
   1951   %res = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   1952   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   1953   %res2 = add <64 x i8> %res, %res1
   1954   ret <64 x i8> %res2
   1955 }
   1956 
   1957 declare <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   1958 
        ; Calls the pmins.w.512 mask intrinsic with the caller's i32 mask %x3 and
        ; again with an all-ones mask (-1), then adds the two results. Both runs
        ; expect a kmovd into %k1, a vpminsw writing %zmm2 under {%k1}, an unmasked
        ; vpminsw, and a final vpaddw.
   1959 define <32 x i16>@test_int_x86_avx512_mask_pmins_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
   1960 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmins_w_512:
   1961 ; AVX512BW:       ## BB#0:
   1962 ; AVX512BW-NEXT:    kmovd %edi, %k1
   1963 ; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm2 {%k1}
   1964 ; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
   1965 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   1966 ; AVX512BW-NEXT:    retq
   1967 ;
   1968 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmins_w_512:
   1969 ; AVX512F-32:       # BB#0:
   1970 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1971 ; AVX512F-32-NEXT:    vpminsw %zmm1, %zmm0, %zmm2 {%k1}
   1972 ; AVX512F-32-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
   1973 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   1974 ; AVX512F-32-NEXT:    retl
   1975   %res = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   1976   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   1977   %res2 = add <32 x i16> %res, %res1
   1978   ret <32 x i16> %res2
   1979 }
   1980 
   1981 declare <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
   1982 
        ; Calls the pminu.b.512 mask intrinsic with the caller's i64 mask %x3 and
        ; again with an all-ones mask (-1), then adds the two results. The expected
        ; code is a masked vpminub into %zmm2, an unmasked vpminub, and a vpaddb.
        ; The x86_64 run uses one kmovq from %rdi for the mask; the i386 run builds
        ; it from two stack loads combined by kunpckdq.
   1983 define <64 x i8>@test_int_x86_avx512_mask_pminu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
   1984 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pminu_b_512:
   1985 ; AVX512BW:       ## BB#0:
   1986 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   1987 ; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2 {%k1}
   1988 ; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm0
   1989 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   1990 ; AVX512BW-NEXT:    retq
   1991 ;
   1992 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pminu_b_512:
   1993 ; AVX512F-32:       # BB#0:
   1994 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   1995 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   1996 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   1997 ; AVX512F-32-NEXT:    vpminub %zmm1, %zmm0, %zmm2 {%k1}
   1998 ; AVX512F-32-NEXT:    vpminub %zmm1, %zmm0, %zmm0
   1999 ; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   2000 ; AVX512F-32-NEXT:    retl
   2001   %res = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   2002   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   2003   %res2 = add <64 x i8> %res, %res1
   2004   ret <64 x i8> %res2
   2005 }
   2006 
   2007 declare <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   2008 
        ; Calls the pminu.w.512 mask intrinsic with the caller's i32 mask %x3 and
        ; again with an all-ones mask (-1), then adds the two results. Both runs
        ; expect a kmovd into %k1, a vpminuw writing %zmm2 under {%k1}, an unmasked
        ; vpminuw, and a final vpaddw.
   2009 define <32 x i16>@test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
   2010 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pminu_w_512:
   2011 ; AVX512BW:       ## BB#0:
   2012 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2013 ; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2 {%k1}
   2014 ; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
   2015 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2016 ; AVX512BW-NEXT:    retq
   2017 ;
   2018 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pminu_w_512:
   2019 ; AVX512F-32:       # BB#0:
   2020 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2021 ; AVX512F-32-NEXT:    vpminuw %zmm1, %zmm0, %zmm2 {%k1}
   2022 ; AVX512F-32-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
   2023 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2024 ; AVX512F-32-NEXT:    retl
   2025   %res = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   2026   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   2027   %res2 = add <32 x i16> %res, %res1
   2028   ret <32 x i16> %res2
   2029 }
   2030 
   2031 declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   2032 
        ; Calls the vpermt2var.hi.512 mask intrinsic with the caller's i32 mask %x3
        ; and again with an all-ones mask (-1), then adds the two results. Both runs
        ; expect %zmm1 to be copied to %zmm3 first (vmovaps), a vpermt2w into the
        ; copy under {%k1}, an unmasked vpermt2w into %zmm1, and a final vpaddw.
   2033 define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
   2034 ; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
   2035 ; AVX512BW:       ## BB#0:
   2036 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2037 ; AVX512BW-NEXT:    vmovaps %zmm1, %zmm3
   2038 ; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 {%k1}
   2039 ; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
   2040 ; AVX512BW-NEXT:    vpaddw %zmm1, %zmm3, %zmm0
   2041 ; AVX512BW-NEXT:    retq
   2042 ;
   2043 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
   2044 ; AVX512F-32:       # BB#0:
   2045 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2046 ; AVX512F-32-NEXT:    vmovaps %zmm1, %zmm3
   2047 ; AVX512F-32-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 {%k1}
   2048 ; AVX512F-32-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
   2049 ; AVX512F-32-NEXT:    vpaddw %zmm1, %zmm3, %zmm0
   2050 ; AVX512F-32-NEXT:    retl
   2051   %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   2052   %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   2053   %res2 = add <32 x i16> %res, %res1
   2054   ret <32 x i16> %res2
   2055 }
   2056 
   2057 declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   2058 
        ; Calls the vpermt2var.hi.512 maskz intrinsic with the caller's i32 mask %x3
        ; and again with an all-ones mask (-1), then adds the two results. Both runs
        ; expect %zmm1 to be copied to %zmm3 (vmovaps), a vpermt2w into the copy
        ; under {%k1} {z}, an unmasked vpermt2w into %zmm1, and a final vpaddw.
   2059 define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
   2060 ; AVX512BW-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
   2061 ; AVX512BW:       ## BB#0:
   2062 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2063 ; AVX512BW-NEXT:    vmovaps %zmm1, %zmm3
   2064 ; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z}
   2065 ; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
   2066 ; AVX512BW-NEXT:    vpaddw %zmm1, %zmm3, %zmm0
   2067 ; AVX512BW-NEXT:    retq
   2068 ;
   2069 ; AVX512F-32-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
   2070 ; AVX512F-32:       # BB#0:
   2071 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2072 ; AVX512F-32-NEXT:    vmovaps %zmm1, %zmm3
   2073 ; AVX512F-32-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z}
   2074 ; AVX512F-32-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
   2075 ; AVX512F-32-NEXT:    vpaddw %zmm1, %zmm3, %zmm0
   2076 ; AVX512F-32-NEXT:    retl
   2077   %res = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   2078   %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   2079   %res2 = add <32 x i16> %res, %res1
   2080   ret <32 x i16> %res2
   2081 }
   2082 
   2083 declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   2084 
        ; Calls the vpermi2var.hi.512 mask intrinsic with the caller's i32 mask %x3
        ; and again with an all-ones mask (-1), then adds the two results. Both runs
        ; expect %zmm1 to be copied to %zmm3 (vmovaps), a vpermi2w into the copy
        ; under {%k1}, an unmasked vpermi2w into %zmm1, and a final vpaddw.
   2085 define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
   2086 ; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
   2087 ; AVX512BW:       ## BB#0:
   2088 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2089 ; AVX512BW-NEXT:    vmovaps %zmm1, %zmm3
   2090 ; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm3 {%k1}
   2091 ; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
   2092 ; AVX512BW-NEXT:    vpaddw %zmm1, %zmm3, %zmm0
   2093 ; AVX512BW-NEXT:    retq
   2094 ;
   2095 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
   2096 ; AVX512F-32:       # BB#0:
   2097 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2098 ; AVX512F-32-NEXT:    vmovaps %zmm1, %zmm3
   2099 ; AVX512F-32-NEXT:    vpermi2w %zmm2, %zmm0, %zmm3 {%k1}
   2100 ; AVX512F-32-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
   2101 ; AVX512F-32-NEXT:    vpaddw %zmm1, %zmm3, %zmm0
   2102 ; AVX512F-32-NEXT:    retl
   2103   %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   2104   %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   2105   %res2 = add <32 x i16> %res, %res1
   2106   ret <32 x i16> %res2
   2107 }
   2108 
   2109 declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
   2110 
        ; Calls the pavg.b.512 mask intrinsic with the caller's i64 mask %x3 and
        ; again with an all-ones mask (-1), then adds the two results. The expected
        ; code is a masked vpavgb into %zmm2, an unmasked vpavgb, and a vpaddb.
        ; The x86_64 run uses one kmovq from %rdi for the mask; the i386 run builds
        ; it from two stack loads combined by kunpckdq.
   2111 define <64 x i8>@test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
   2112 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_b_512:
   2113 ; AVX512BW:       ## BB#0:
   2114 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   2115 ; AVX512BW-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1}
   2116 ; AVX512BW-NEXT:    vpavgb %zmm1, %zmm0, %zmm0
   2117 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   2118 ; AVX512BW-NEXT:    retq
   2119 ;
   2120 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pavg_b_512:
   2121 ; AVX512F-32:       # BB#0:
   2122 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   2123 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2124 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   2125 ; AVX512F-32-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1}
   2126 ; AVX512F-32-NEXT:    vpavgb %zmm1, %zmm0, %zmm0
   2127 ; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   2128 ; AVX512F-32-NEXT:    retl
   2129   %res = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   2130   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   2131   %res2 = add <64 x i8> %res, %res1
   2132   ret <64 x i8> %res2
   2133 }
   2134 
   2135 declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   2136 
        ; Calls the pavg.w.512 mask intrinsic with the caller's i32 mask %x3 and
        ; again with an all-ones mask (-1), then adds the two results. Both runs
        ; expect a kmovd into %k1, a vpavgw writing %zmm2 under {%k1}, an unmasked
        ; vpavgw, and a final vpaddw.
   2137 define <32 x i16>@test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
   2138 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_w_512:
   2139 ; AVX512BW:       ## BB#0:
   2140 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2141 ; AVX512BW-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1}
   2142 ; AVX512BW-NEXT:    vpavgw %zmm1, %zmm0, %zmm0
   2143 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2144 ; AVX512BW-NEXT:    retq
   2145 ;
   2146 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pavg_w_512:
   2147 ; AVX512F-32:       # BB#0:
   2148 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2149 ; AVX512F-32-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1}
   2150 ; AVX512F-32-NEXT:    vpavgw %zmm1, %zmm0, %zmm0
   2151 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2152 ; AVX512F-32-NEXT:    retl
   2153   %res = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   2154   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   2155   %res2 = add <32 x i16> %res, %res1
   2156   ret <32 x i16> %res2
   2157 }
   2158 
   2159 declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
   2160 
        ; Calls the pshuf.b.512 mask intrinsic with the caller's i64 mask %x3 and
        ; again with an all-ones mask (-1), then adds the two results. The expected
        ; code is a masked vpshufb into %zmm2, an unmasked vpshufb, and a vpaddb.
        ; The x86_64 run uses one kmovq from %rdi for the mask; the i386 run builds
        ; it from two stack loads combined by kunpckdq.
   2161 define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
   2162 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pshuf_b_512:
   2163 ; AVX512BW:       ## BB#0:
   2164 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   2165 ; AVX512BW-NEXT:    vpshufb %zmm1, %zmm0, %zmm2 {%k1}
   2166 ; AVX512BW-NEXT:    vpshufb %zmm1, %zmm0, %zmm0
   2167 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   2168 ; AVX512BW-NEXT:    retq
   2169 ;
   2170 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshuf_b_512:
   2171 ; AVX512F-32:       # BB#0:
   2172 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   2173 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2174 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   2175 ; AVX512F-32-NEXT:    vpshufb %zmm1, %zmm0, %zmm2 {%k1}
   2176 ; AVX512F-32-NEXT:    vpshufb %zmm1, %zmm0, %zmm0
   2177 ; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   2178 ; AVX512F-32-NEXT:    retl
   2179   %res = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   2180   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   2181   %res2 = add <64 x i8> %res, %res1
   2182   ret <64 x i8> %res2
   2183 }
   2184 
   2185 declare <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16>, <32 x i16>, i32)
   2186 
        ; Calls the single-source pabs.w.512 mask intrinsic with the caller's i32
        ; mask %x2 and again with an all-ones mask (-1), then adds the two results.
        ; Both runs expect a kmovd into %k1, a vpabsw of %zmm0 written to %zmm1
        ; under {%k1}, an unmasked vpabsw, and a final vpaddw.
   2187 define <32 x i16>@test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
   2188 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_w_512:
   2189 ; AVX512BW:       ## BB#0:
   2190 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2191 ; AVX512BW-NEXT:    vpabsw %zmm0, %zmm1 {%k1}
   2192 ; AVX512BW-NEXT:    vpabsw %zmm0, %zmm0
   2193 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
   2194 ; AVX512BW-NEXT:    retq
   2195 ;
   2196 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pabs_w_512:
   2197 ; AVX512F-32:       # BB#0:
   2198 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2199 ; AVX512F-32-NEXT:    vpabsw %zmm0, %zmm1 {%k1}
   2200 ; AVX512F-32-NEXT:    vpabsw %zmm0, %zmm0
   2201 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
   2202 ; AVX512F-32-NEXT:    retl
   2203   %res = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
   2204   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1)
   2205   %res2 = add <32 x i16> %res, %res1
   2206   ret <32 x i16> %res2
   2207 }
   2208 
   2209 declare <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8>, <64 x i8>, i64)
   2210 
        ; Calls the single-source pabs.b.512 mask intrinsic with the caller's i64
        ; mask %x2 and again with an all-ones mask (-1), then adds the two results.
        ; The expected code is a vpabsb of %zmm0 written to %zmm1 under {%k1}, an
        ; unmasked vpabsb, and a vpaddb. The x86_64 run uses one kmovq from %rdi
        ; for the mask; the i386 run joins two stack loads with kunpckdq.
   2211 define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
   2212 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_b_512:
   2213 ; AVX512BW:       ## BB#0:
   2214 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   2215 ; AVX512BW-NEXT:    vpabsb %zmm0, %zmm1 {%k1}
   2216 ; AVX512BW-NEXT:    vpabsb %zmm0, %zmm0
   2217 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
   2218 ; AVX512BW-NEXT:    retq
   2219 ;
   2220 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pabs_b_512:
   2221 ; AVX512F-32:       # BB#0:
   2222 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   2223 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2224 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   2225 ; AVX512F-32-NEXT:    vpabsb %zmm0, %zmm1 {%k1}
   2226 ; AVX512F-32-NEXT:    vpabsb %zmm0, %zmm0
   2227 ; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
   2228 ; AVX512F-32-NEXT:    retl
   2229   %res = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
   2230   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1)
   2231   %res2 = add <64 x i8> %res, %res1
   2232   ret <64 x i8> %res2
   2233 }
   2234 
   2235 declare <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   2236 
        ; Calls the pmulhu.w.512 mask intrinsic with the caller's i32 mask %x3 and
        ; again with an all-ones mask (-1), then adds the two results. Both runs
        ; expect a kmovd into %k1, a vpmulhuw writing %zmm2 under {%k1}, an
        ; unmasked vpmulhuw, and a final vpaddw.
   2237 define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
   2238 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
   2239 ; AVX512BW:       ## BB#0:
   2240 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2241 ; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm2 {%k1}
   2242 ; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm0
   2243 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2244 ; AVX512BW-NEXT:    retq
   2245 ;
   2246 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
   2247 ; AVX512F-32:       # BB#0:
   2248 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2249 ; AVX512F-32-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm2 {%k1}
   2250 ; AVX512F-32-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm0
   2251 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2252 ; AVX512F-32-NEXT:    retl
   2253   %res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   2254   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   2255   %res2 = add <32 x i16> %res, %res1
   2256   ret <32 x i16> %res2
   2257 }
   2258 
   2259 declare <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   2260 
        ; Calls the pmulh.w.512 mask intrinsic with the caller's i32 mask %x3 and
        ; again with an all-ones mask (-1), then adds the two results. Both runs
        ; expect a kmovd into %k1, a vpmulhw writing %zmm2 under {%k1}, an unmasked
        ; vpmulhw, and a final vpaddw.
   2261 define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
   2262 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
   2263 ; AVX512BW:       ## BB#0:
   2264 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2265 ; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm2 {%k1}
   2266 ; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm0
   2267 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2268 ; AVX512BW-NEXT:    retq
   2269 ;
   2270 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
   2271 ; AVX512F-32:       # BB#0:
   2272 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2273 ; AVX512F-32-NEXT:    vpmulhw %zmm1, %zmm0, %zmm2 {%k1}
   2274 ; AVX512F-32-NEXT:    vpmulhw %zmm1, %zmm0, %zmm0
   2275 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2276 ; AVX512F-32-NEXT:    retl
   2277   %res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   2278   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   2279   %res2 = add <32 x i16> %res, %res1
   2280   ret <32 x i16> %res2
   2281 }
   2282 
   2283 declare <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   2284 
        ; Calls the pmul.hr.sw.512 mask intrinsic with the caller's i32 mask %x3 and
        ; again with an all-ones mask (-1), then adds the two results. Both runs
        ; expect a kmovd into %k1, a vpmulhrsw writing %zmm2 under {%k1}, an
        ; unmasked vpmulhrsw, and a final vpaddw.
   2285 define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
   2286 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
   2287 ; AVX512BW:       ## BB#0:
   2288 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2289 ; AVX512BW-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1}
   2290 ; AVX512BW-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm0
   2291 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2292 ; AVX512BW-NEXT:    retq
   2293 ;
   2294 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
   2295 ; AVX512F-32:       # BB#0:
   2296 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2297 ; AVX512F-32-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1}
   2298 ; AVX512F-32-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm0
   2299 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2300 ; AVX512F-32-NEXT:    retl
   2301   %res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   2302   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   2303   %res2 = add <32 x i16> %res, %res1
   2304   ret <32 x i16> %res2
   2305 }
   2306 
   2307 declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
   2308 
        ; Calls the pmov.wb.512 mask intrinsic three ways and sums the results. The
        ; calls use an all-ones mask (-1), the caller's mask %x2 with %x1 as the
        ; second operand, and %x2 with zeroinitializer as the second operand. Both
        ; runs expect a vpmovwb to %ymm1 under {%k1}, a vpmovwb to %ymm2 under
        ; {%k1} {z}, an unmasked vpmovwb, and two vpaddb.
   2309 define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
   2310 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
   2311 ; AVX512BW:       ## BB#0:
   2312 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2313 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1 {%k1}
   2314 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm2 {%k1} {z}
   2315 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2316 ; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   2317 ; AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   2318 ; AVX512BW-NEXT:    retq
   2319 ;
   2320 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
   2321 ; AVX512F-32:       # BB#0:
   2322 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2323 ; AVX512F-32-NEXT:    vpmovwb %zmm0, %ymm1 {%k1}
   2324 ; AVX512F-32-NEXT:    vpmovwb %zmm0, %ymm2 {%k1} {z}
   2325 ; AVX512F-32-NEXT:    vpmovwb %zmm0, %ymm0
   2326 ; AVX512F-32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   2327 ; AVX512F-32-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   2328 ; AVX512F-32-NEXT:    retl
   2329     %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
   2330     %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
   2331     %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
   2332     %res3 = add <32 x i8> %res0, %res1
   2333     %res4 = add <32 x i8> %res3, %res2
   2334     ret <32 x i8> %res4
   2335 }
   2336 
   2337 declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16>, i32)
   2338 
        ; Calls the pmov.wb.mem.512 mask intrinsic twice on the same %ptr and %x1,
        ; first with an all-ones mask (-1) and then with the caller's mask %x2.
        ; Both runs expect an unmasked vpmovwb store followed by a {%k1}-masked
        ; store to the same address; here the mask is loaded into %k1 before both.
   2339 define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
   2340 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512:
   2341 ; AVX512BW:       ## BB#0:
   2342 ; AVX512BW-NEXT:    kmovd %esi, %k1
   2343 ; AVX512BW-NEXT:    vpmovwb %zmm0, (%rdi)
   2344 ; AVX512BW-NEXT:    vpmovwb %zmm0, (%rdi) {%k1}
   2345 ; AVX512BW-NEXT:    retq
   2346 ;
   2347 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512:
   2348 ; AVX512F-32:       # BB#0:
   2349 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2350 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   2351 ; AVX512F-32-NEXT:    vpmovwb %zmm0, (%eax)
   2352 ; AVX512F-32-NEXT:    vpmovwb %zmm0, (%eax) {%k1}
   2353 ; AVX512F-32-NEXT:    retl
   2354     call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1)
   2355     call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2)
   2356     ret void
   2357 }
   2358 
   2359 declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32)
   2360 
        ; Calls the pmovs.wb.512 mask intrinsic three ways and sums the results. The
        ; calls use an all-ones mask (-1), the caller's mask %x2 with %x1 as the
        ; second operand, and %x2 with zeroinitializer as the second operand. Both
        ; runs expect a vpmovswb to %ymm1 under {%k1}, a vpmovswb to %ymm2 under
        ; {%k1} {z}, an unmasked vpmovswb, and two vpaddb.
   2361 define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
   2362 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovs_wb_512:
   2363 ; AVX512BW:       ## BB#0:
   2364 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2365 ; AVX512BW-NEXT:    vpmovswb %zmm0, %ymm1 {%k1}
   2366 ; AVX512BW-NEXT:    vpmovswb %zmm0, %ymm2 {%k1} {z}
   2367 ; AVX512BW-NEXT:    vpmovswb %zmm0, %ymm0
   2368 ; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   2369 ; AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   2370 ; AVX512BW-NEXT:    retq
   2371 ;
   2372 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovs_wb_512:
   2373 ; AVX512F-32:       # BB#0:
   2374 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2375 ; AVX512F-32-NEXT:    vpmovswb %zmm0, %ymm1 {%k1}
   2376 ; AVX512F-32-NEXT:    vpmovswb %zmm0, %ymm2 {%k1} {z}
   2377 ; AVX512F-32-NEXT:    vpmovswb %zmm0, %ymm0
   2378 ; AVX512F-32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   2379 ; AVX512F-32-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   2380 ; AVX512F-32-NEXT:    retl
   2381     %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
   2382     %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
   2383     %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
   2384     %res3 = add <32 x i8> %res0, %res1
   2385     %res4 = add <32 x i8> %res3, %res2
   2386     ret <32 x i8> %res4
   2387 }
   2388 
   2389 declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16>, i32)
   2390 
        ; Calls the pmovs.wb.mem.512 mask intrinsic twice on the same %ptr and %x1,
        ; first with an all-ones mask (-1) and then with the caller's mask %x2.
        ; Both runs expect the unmasked vpmovswb store first, then the mask moved
        ; into %k1, then the {%k1}-masked store to the same address. The i386 run
        ; loads the mask and the pointer from the stack into %eax and %ecx first.
   2391 define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
   2392 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512:
   2393 ; AVX512BW:       ## BB#0:
   2394 ; AVX512BW-NEXT:    vpmovswb %zmm0, (%rdi)
   2395 ; AVX512BW-NEXT:    kmovd %esi, %k1
   2396 ; AVX512BW-NEXT:    vpmovswb %zmm0, (%rdi) {%k1}
   2397 ; AVX512BW-NEXT:    retq
   2398 ;
   2399 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512:
   2400 ; AVX512F-32:       # BB#0:
   2401 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   2402 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   2403 ; AVX512F-32-NEXT:    vpmovswb %zmm0, (%ecx)
   2404 ; AVX512F-32-NEXT:    kmovd %eax, %k1
   2405 ; AVX512F-32-NEXT:    vpmovswb %zmm0, (%ecx) {%k1}
   2406 ; AVX512F-32-NEXT:    retl
   2407     call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1)
   2408     call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2)
   2409     ret void
   2410 }
   2411 
   2412 declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32)
   2413 
        ; Calls the pmovus.wb.512 mask intrinsic three ways and sums the results.
        ; The calls use an all-ones mask (-1), the caller's mask %x2 with %x1 as
        ; the second operand, and %x2 with zeroinitializer as the second operand.
        ; Both runs expect a vpmovuswb to %ymm1 under {%k1}, a vpmovuswb to %ymm2
        ; under {%k1} {z}, an unmasked vpmovuswb, and two vpaddb.
   2414 define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
   2415 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovus_wb_512:
   2416 ; AVX512BW:       ## BB#0:
   2417 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2418 ; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm1 {%k1}
   2419 ; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm2 {%k1} {z}
   2420 ; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm0
   2421 ; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   2422 ; AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   2423 ; AVX512BW-NEXT:    retq
   2424 ;
   2425 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovus_wb_512:
   2426 ; AVX512F-32:       # BB#0:
   2427 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2428 ; AVX512F-32-NEXT:    vpmovuswb %zmm0, %ymm1 {%k1}
   2429 ; AVX512F-32-NEXT:    vpmovuswb %zmm0, %ymm2 {%k1} {z}
   2430 ; AVX512F-32-NEXT:    vpmovuswb %zmm0, %ymm0
   2431 ; AVX512F-32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   2432 ; AVX512F-32-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   2433 ; AVX512F-32-NEXT:    retl
   2434     %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
   2435     %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
   2436     %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
   2437     %res3 = add <32 x i8> %res0, %res1
   2438     %res4 = add <32 x i8> %res3, %res2
   2439     ret <32 x i8> %res4
   2440 }
   2441 
   2442 declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16>, i32)
   2443 
        ; Calls the pmovus.wb.mem.512 mask intrinsic twice on the same %ptr and
        ; %x1, first with an all-ones mask (-1) and then with the caller's mask
        ; %x2. Both runs expect the unmasked vpmovuswb store first, then the mask
        ; moved into %k1, then the {%k1}-masked store to the same address. The i386
        ; run loads the mask and the pointer from the stack into %eax and %ecx first.
   2444 define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
   2445 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512:
   2446 ; AVX512BW:       ## BB#0:
   2447 ; AVX512BW-NEXT:    vpmovuswb %zmm0, (%rdi)
   2448 ; AVX512BW-NEXT:    kmovd %esi, %k1
   2449 ; AVX512BW-NEXT:    vpmovuswb %zmm0, (%rdi) {%k1}
   2450 ; AVX512BW-NEXT:    retq
   2451 ;
   2452 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512:
   2453 ; AVX512F-32:       # BB#0:
   2454 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   2455 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   2456 ; AVX512F-32-NEXT:    vpmovuswb %zmm0, (%ecx)
   2457 ; AVX512F-32-NEXT:    kmovd %eax, %k1
   2458 ; AVX512F-32-NEXT:    vpmovuswb %zmm0, (%ecx) {%k1}
   2459 ; AVX512F-32-NEXT:    retl
   2460     call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1)
   2461     call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2)
   2462     ret void
   2463 }
   2464 
   2465 declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32)
   2466 
        ; Calls the pmaddubs.w.512 mask intrinsic (two <64 x i8> inputs, <32 x i16>
        ; result) with the caller's i32 mask %x3 and again with an all-ones mask
        ; (-1), then adds the two results. Both runs expect a kmovd into %k1, a
        ; vpmaddubsw writing %zmm2 under {%k1}, an unmasked vpmaddubsw, and a vpaddw.
   2467 define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) {
   2468 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
   2469 ; AVX512BW:       ## BB#0:
   2470 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2471 ; AVX512BW-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1}
   2472 ; AVX512BW-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm0
   2473 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2474 ; AVX512BW-NEXT:    retq
   2475 ;
   2476 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
   2477 ; AVX512F-32:       # BB#0:
   2478 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2479 ; AVX512F-32-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1}
   2480 ; AVX512F-32-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm0
   2481 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2482 ; AVX512F-32-NEXT:    retl
   2483   %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3)
   2484   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1)
   2485   %res2 = add <32 x i16> %res, %res1
   2486   ret <32 x i16> %res2
   2487 }
   2488 
   2489 declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16)
   2490 
   2491 define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) {
        ; Codegen test for the pmaddw.d.512 intrinsic: one call uses the variable i16 mask %x3,
        ; the other an all-ones mask; the checks expect a {%k1}-masked and a plain vpmaddwd.
   2492 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
   2493 ; AVX512BW:       ## BB#0:
   2494 ; AVX512BW-NEXT:    kmovw %edi, %k1
   2495 ; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm2 {%k1}
   2496 ; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm0
   2497 ; AVX512BW-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
   2498 ; AVX512BW-NEXT:    retq
   2499 ;
   2500 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
   2501 ; AVX512F-32:       # BB#0:
   2502 ; AVX512F-32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   2503 ; AVX512F-32-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm2 {%k1}
   2504 ; AVX512F-32-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm0
   2505 ; AVX512F-32-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
   2506 ; AVX512F-32-NEXT:    retl
          ; The result has 16 x i32 lanes, so the mask is i16 and is moved with kmovw in the
          ; checks. Both results are summed, presumably so that neither call is dead.
   2507   %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3)
   2508   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1)
   2509   %res2 = add <16 x i32> %res, %res1
   2510   ret <16 x i32> %res2
   2511 }
   2512 
   2513 declare <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
   2514 
   2515 define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
        ; Codegen test for the punpckhb.w.512 intrinsic with a variable i64 mask %x3 and with an
        ; all-ones mask; the checks expect two vpunpckhbw instructions followed by vpaddb.
        ; On the i386 configuration the checks expect the i64 mask to be assembled from two
        ; 32-bit stack loads combined with kunpckdq.
        ; NOTE(review): in the first vpunpckhbw check the shuffle comment lists k1 where the
        ; zmm0/zmm1 sources would be expected; this looks like an artifact of how the shuffle
        ; comment was printed for the masked form -- TODO confirm, and regenerate with
        ; utils/update_llc_test_checks.py rather than hand-editing.
   2516 ; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
   2517 ; AVX512BW:       ## BB#0:
   2518 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   2519 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31],zmm2[40],k1[40],zmm2[41],k1[41],zmm2[42],k1[42],zmm2[43],k1[43],zmm2[44],k1[44],zmm2[45],k1[45],zmm2[46],k1[46],zmm2[47],k1[47],zmm2[56],k1[56],zmm2[57],k1[57],zmm2[58],k1[58],zmm2[59],k1[59],zmm2[60],k1[60],zmm2[61],k1[61],zmm2[62],k1[62],zmm2[63],k1[63]
   2520 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
   2521 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   2522 ; AVX512BW-NEXT:    retq
   2523 ;
   2524 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
   2525 ; AVX512F-32:       # BB#0:
   2526 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   2527 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2528 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   2529 ; AVX512F-32-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31],zmm2[40],k1[40],zmm2[41],k1[41],zmm2[42],k1[42],zmm2[43],k1[43],zmm2[44],k1[44],zmm2[45],k1[45],zmm2[46],k1[46],zmm2[47],k1[47],zmm2[56],k1[56],zmm2[57],k1[57],zmm2[58],k1[58],zmm2[59],k1[59],zmm2[60],k1[60],zmm2[61],k1[61],zmm2[62],k1[62],zmm2[63],k1[63]
   2530 ; AVX512F-32-NEXT:    vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
   2531 ; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   2532 ; AVX512F-32-NEXT:    retl
          ; %res uses the variable mask, %res1 the all-ones mask; both are summed into the
          ; return value, presumably so that neither call is dead.
   2533   %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   2534   %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   2535   %res2 = add <64 x i8> %res, %res1
   2536   ret <64 x i8> %res2
   2537 }
   2538 
   2539 declare <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
   2540 
   2541 define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
        ; Codegen test for the punpcklb.w.512 intrinsic with a variable i64 mask %x3 and with an
        ; all-ones mask; the checks expect two vpunpcklbw instructions followed by vpaddb.
        ; On the i386 configuration the checks expect the i64 mask to be assembled from two
        ; 32-bit stack loads combined with kunpckdq.
        ; NOTE(review): in the first vpunpcklbw check the shuffle comment lists k1 where the
        ; zmm0/zmm1 sources would be expected; this looks like an artifact of how the shuffle
        ; comment was printed for the masked form -- TODO confirm, and regenerate with
        ; utils/update_llc_test_checks.py rather than hand-editing.
   2542 ; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
   2543 ; AVX512BW:       ## BB#0:
   2544 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   2545 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[32],k1[32],zmm2[33],k1[33],zmm2[34],k1[34],zmm2[35],k1[35],zmm2[36],k1[36],zmm2[37],k1[37],zmm2[38],k1[38],zmm2[39],k1[39],zmm2[48],k1[48],zmm2[49],k1[49],zmm2[50],k1[50],zmm2[51],k1[51],zmm2[52],k1[52],zmm2[53],k1[53],zmm2[54],k1[54],zmm2[55],k1[55]
   2546 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
   2547 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   2548 ; AVX512BW-NEXT:    retq
   2549 ;
   2550 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
   2551 ; AVX512F-32:       # BB#0:
   2552 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   2553 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2554 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   2555 ; AVX512F-32-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[32],k1[32],zmm2[33],k1[33],zmm2[34],k1[34],zmm2[35],k1[35],zmm2[36],k1[36],zmm2[37],k1[37],zmm2[38],k1[38],zmm2[39],k1[39],zmm2[48],k1[48],zmm2[49],k1[49],zmm2[50],k1[50],zmm2[51],k1[51],zmm2[52],k1[52],zmm2[53],k1[53],zmm2[54],k1[54],zmm2[55],k1[55]
   2556 ; AVX512F-32-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
   2557 ; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   2558 ; AVX512F-32-NEXT:    retl
          ; %res uses the variable mask, %res1 the all-ones mask; both are summed into the
          ; return value, presumably so that neither call is dead.
   2559   %res = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   2560   %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   2561   %res2 = add <64 x i8> %res, %res1
   2562   ret <64 x i8> %res2
   2563 }
   2564 
   2565 declare <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   2566 
   2567 define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
        ; Codegen test for the punpckhw.d.512 intrinsic with a variable i32 mask %x3 and with an
        ; all-ones mask; the checks expect two vpunpckhwd instructions followed by vpaddw.
        ; NOTE(review): in the first vpunpckhwd check the shuffle comment lists k1 where the
        ; zmm0/zmm1 sources would be expected; this looks like an artifact of how the shuffle
        ; comment was printed for the masked form -- TODO confirm, and regenerate with
        ; utils/update_llc_test_checks.py rather than hand-editing.
   2568 ; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
   2569 ; AVX512BW:       ## BB#0:
   2570 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2571 ; AVX512BW-NEXT:    vpunpckhwd {{.*#+}} zmm2 = zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31]
   2572 ; AVX512BW-NEXT:    vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
   2573 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2574 ; AVX512BW-NEXT:    retq
   2575 ;
   2576 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
   2577 ; AVX512F-32:       # BB#0:
   2578 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2579 ; AVX512F-32-NEXT:    vpunpckhwd {{.*#+}} zmm2 = zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31]
   2580 ; AVX512F-32-NEXT:    vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
   2581 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2582 ; AVX512F-32-NEXT:    retl
          ; %res uses the variable mask, %res1 the all-ones mask; both are summed into the
          ; return value, presumably so that neither call is dead.
   2583   %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   2584   %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   2585   %res2 = add <32 x i16> %res, %res1
   2586   ret <32 x i16> %res2
   2587 }
   2588 
   2589 declare <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   2590 
   2591 define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
        ; Codegen test for the punpcklw.d.512 intrinsic with a variable i32 mask %x3 and with an
        ; all-ones mask; the checks expect two vpunpcklwd instructions followed by vpaddw.
        ; NOTE(review): in the first vpunpcklwd check the shuffle comment lists k1 where the
        ; zmm0/zmm1 sources would be expected; this looks like an artifact of how the shuffle
        ; comment was printed for the masked form -- TODO confirm, and regenerate with
        ; utils/update_llc_test_checks.py rather than hand-editing.
   2592 ; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
   2593 ; AVX512BW:       ## BB#0:
   2594 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2595 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27]
   2596 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
   2597 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2598 ; AVX512BW-NEXT:    retq
   2599 ;
   2600 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
   2601 ; AVX512F-32:       # BB#0:
   2602 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2603 ; AVX512F-32-NEXT:    vpunpcklwd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27]
   2604 ; AVX512F-32-NEXT:    vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
   2605 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
   2606 ; AVX512F-32-NEXT:    retl
          ; %res uses the variable mask, %res1 the all-ones mask; both are summed into the
          ; return value, presumably so that neither call is dead.
   2607   %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   2608   %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   2609   %res2 = add <32 x i16> %res, %res1
   2610   ret <32 x i16> %res2
   2611 }
   2612 
   2613 declare <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8>, <64 x i8>, i32, <64 x i8>, i64)
   2614 
   2615 define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3, i64 %x4) {
        ; Codegen test for the palignr.512 intrinsic with immediate 2 in three mask flavours;
        ; the checks expect vpalignr $2 in {%k1}, {%k1} {z} and plain forms.
        ; On the i386 configuration the checks expect the i64 mask to be assembled from two
        ; 32-bit stack loads combined with kunpckdq, and the unmasked vpalignr to come first.
   2616 ; AVX512BW-LABEL: test_int_x86_avx512_mask_palignr_512:
   2617 ; AVX512BW:       ## BB#0:
   2618 ; AVX512BW-NEXT:    kmovq %rdi, %k1
   2619 ; AVX512BW-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1}
   2620 ; AVX512BW-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
   2621 ; AVX512BW-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm0
   2622 ; AVX512BW-NEXT:    vpaddb %zmm3, %zmm2, %zmm1
   2623 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
   2624 ; AVX512BW-NEXT:    retq
   2625 ;
   2626 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_palignr_512:
   2627 ; AVX512F-32:       # BB#0:
   2628 ; AVX512F-32-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm3
   2629 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   2630 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2631 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
   2632 ; AVX512F-32-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1}
   2633 ; AVX512F-32-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm0 {%k1} {z}
   2634 ; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
   2635 ; AVX512F-32-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
   2636 ; AVX512F-32-NEXT:    retl
          ; %res:  passthrough %x3 with mask %x4 (the {%k1} form in the checks).
          ; %res1: zeroinitializer passthrough with mask %x4 (the {%k1} {z} form).
          ; %res2: all-ones mask (the plain form).
          ; The three results are summed, presumably so that no call is dead.
   2637   %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4)
   2638   %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4)
   2639   %res2 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1)
   2640   %res3 = add <64 x i8> %res, %res1
   2641   %res4 = add <64 x i8> %res3, %res2
   2642   ret <64 x i8> %res4
   2643 }
   2644 
   2645 declare <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8>, <64 x i8>, i32, <32 x i16>, i32)
   2646 
   2647 define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) {
        ; Codegen test for the dbpsadbw.512 intrinsic with immediate 2 in three mask flavours;
        ; the checks expect vdbpsadbw $2 in {%k1}, {%k1} {z} and plain forms, then two vpaddw.
   2648 ; AVX512BW-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
   2649 ; AVX512BW:       ## BB#0:
   2650 ; AVX512BW-NEXT:    kmovd %edi, %k1
   2651 ; AVX512BW-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
   2652 ; AVX512BW-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
   2653 ; AVX512BW-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm0
   2654 ; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm1
   2655 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
   2656 ; AVX512BW-NEXT:    retq
   2657 ;
   2658 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
   2659 ; AVX512F-32:       # BB#0:
   2660 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2661 ; AVX512F-32-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
   2662 ; AVX512F-32-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
   2663 ; AVX512F-32-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm0
   2664 ; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm1
   2665 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
   2666 ; AVX512F-32-NEXT:    retl
          ; %res:  passthrough %x3 with mask %x4 (the {%k1} form in the checks).
          ; %res1: zeroinitializer passthrough with mask %x4 (the {%k1} {z} form).
          ; %res2: all-ones mask (the plain form).
          ; The three results are summed, presumably so that no call is dead.
   2667   %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
   2668   %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4)
   2669   %res2 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 -1)
   2670   %res3 = add <32 x i16> %res, %res1
   2671   %res4 = add <32 x i16> %res3, %res2
   2672   ret <32 x i16> %res4
   2673 }
   2674 
   2675 declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32)
   2676 
   2677 define <8 x i64>@test_int_x86_avx512_mask_psll_dq_512(<8 x i64> %x0) {
        ; Codegen test for the psll.dq.512 intrinsic called with immediates 8 and 4; the checks
        ; expect vpslldq $8 and vpslldq $4 on zmm0 followed by vpaddq.
        ; The calls below take only a vector and an i32 immediate (no mask operand), despite
        ; "mask" appearing in the test name.
   2678 ; AVX512BW-LABEL: test_int_x86_avx512_mask_psll_dq_512:
   2679 ; AVX512BW:       ## BB#0:
   2680 ; AVX512BW-NEXT:    vpslldq $8, %zmm0, %zmm1
   2681 ; AVX512BW-NEXT:    vpslldq $4, %zmm0, %zmm0
   2682 ; AVX512BW-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
   2683 ; AVX512BW-NEXT:    retq
   2684 ;
   2685 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_dq_512:
   2686 ; AVX512F-32:       # BB#0:
   2687 ; AVX512F-32-NEXT:    vpslldq $8, %zmm0, %zmm1
   2688 ; AVX512F-32-NEXT:    vpslldq $4, %zmm0, %zmm0
   2689 ; AVX512F-32-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
   2690 ; AVX512F-32-NEXT:    retl
          ; Two different immediates are used and the results summed, presumably so that both
          ; calls stay live and distinct.
   2691   %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8)
   2692   %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
   2693   %res2 = add <8 x i64> %res, %res1
   2694   ret <8 x i64> %res2
   2695 }
   2696 
   2697 declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32)
   2698 
   2699 define <8 x i64>@test_int_x86_avx512_mask_psrl_dq_512(<8 x i64> %x0) {
        ; Codegen test for the psrl.dq.512 intrinsic called with immediates 8 and 4; the checks
        ; expect vpsrldq $8 and vpsrldq $4 on zmm0 followed by vpaddq.
        ; The calls below take only a vector and an i32 immediate (no mask operand), despite
        ; "mask" appearing in the test name.
   2700 ; AVX512BW-LABEL: test_int_x86_avx512_mask_psrl_dq_512:
   2701 ; AVX512BW:       ## BB#0:
   2702 ; AVX512BW-NEXT:    vpsrldq $8, %zmm0, %zmm1
   2703 ; AVX512BW-NEXT:    vpsrldq $4, %zmm0, %zmm0
   2704 ; AVX512BW-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
   2705 ; AVX512BW-NEXT:    retq
   2706 ;
   2707 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_dq_512:
   2708 ; AVX512F-32:       # BB#0:
   2709 ; AVX512F-32-NEXT:    vpsrldq $8, %zmm0, %zmm1
   2710 ; AVX512F-32-NEXT:    vpsrldq $4, %zmm0, %zmm0
   2711 ; AVX512F-32-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
   2712 ; AVX512F-32-NEXT:    retl
          ; Two different immediates are used and the results summed, presumably so that both
          ; calls stay live and distinct.
   2713   %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8)
   2714   %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
   2715   %res2 = add <8 x i64> %res, %res1
   2716   ret <8 x i64> %res2
   2717 }
   2718 declare  <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)
   2719 
   2720 define  <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
        ; Codegen test for the psad.bw.512 intrinsic applied to (%x0, %x1) and (%x0, %x2); the
        ; checks expect two vpsadbw instructions followed by vpaddq.
        ; The calls below take two vectors only (no mask operand), despite "mask" appearing in
        ; the test name.
   2721 ; AVX512BW-LABEL: test_int_x86_avx512_mask_psadb_w_512:
   2722 ; AVX512BW:       ## BB#0:
   2723 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm1
   2724 ; AVX512BW-NEXT:    vpsadbw %zmm2, %zmm0, %zmm0
   2725 ; AVX512BW-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
   2726 ; AVX512BW-NEXT:    retq
   2727 ;
   2728 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psadb_w_512:
   2729 ; AVX512F-32:       # BB#0:
   2730 ; AVX512F-32-NEXT:    vpsadbw %zmm1, %zmm0, %zmm1
   2731 ; AVX512F-32-NEXT:    vpsadbw %zmm2, %zmm0, %zmm0
   2732 ; AVX512F-32-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
   2733 ; AVX512F-32-NEXT:    retl
          ; The second call swaps in %x2 as the second operand; the results are summed,
          ; presumably so that both calls stay live and distinct.
   2734   %res = call  <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1)
   2735   %res1 = call  <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2)
   2736   %res2 = add  <8 x i64> %res, %res1
   2737   ret  <8 x i64> %res2
   2738 }
   2739 
   2740 declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
   2741 
   2742 define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
        ; Codegen test for the kunpck.wd intrinsic on two i32 values; the checks expect both
        ; operands to be moved into k0/k1 with kmovd, combined with kunpckwd, and the result
        ; moved back to eax with kmovd.
   2743 ; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
   2744 ; AVX512BW:       ## BB#0:
   2745 ; AVX512BW-NEXT:    kmovd %edi, %k0
   2746 ; AVX512BW-NEXT:    kmovd %esi, %k1
   2747 ; AVX512BW-NEXT:    kunpckwd %k1, %k0, %k0
   2748 ; AVX512BW-NEXT:    kmovd %k0, %eax
   2749 ; AVX512BW-NEXT:    retq
   2750 ;
   2751 ; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
   2752 ; AVX512F-32:       # BB#0:
   2753 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   2754 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2755 ; AVX512F-32-NEXT:    kunpckwd %k1, %k0, %k0
   2756 ; AVX512F-32-NEXT:    kmovd %k0, %eax
   2757 ; AVX512F-32-NEXT:    retl
          ; Single call; its i32 result is returned directly.
   2758   %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
   2759   ret i32 %res
   2760 }
   2761 
   2762 declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
   2763 
   2764 define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
        ; Codegen test for the kunpck.dq intrinsic on two i64 values.
        ; x86-64 configuration: the checks expect kmovq into k0/k1, kunpckdq, kmovq to rax.
        ; i386 configuration: the checks expect two 32-bit kmovd stack loads, kunpckdq, then the
        ; 64-bit result spilled with kmovq to a 12-byte stack temporary and reloaded as the
        ; eax/edx pair. Presumably only the low 32 bits of each i64 operand are needed by
        ; kunpckdq, which would explain why just two 32-bit loads appear -- TODO confirm.
        ; NOTE(review): the test name ends in _qd while the intrinsic is kunpck.dq; confirm
        ; whether the spelling is intentional before renaming (the LABEL checks depend on it).
   2765 ; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
   2766 ; AVX512BW:       ## BB#0:
   2767 ; AVX512BW-NEXT:    kmovq %rdi, %k0
   2768 ; AVX512BW-NEXT:    kmovq %rsi, %k1
   2769 ; AVX512BW-NEXT:    kunpckdq %k1, %k0, %k0
   2770 ; AVX512BW-NEXT:    kmovq %k0, %rax
   2771 ; AVX512BW-NEXT:    retq
   2772 ;
   2773 ; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
   2774 ; AVX512F-32:       # BB#0:
   2775 ; AVX512F-32-NEXT:    subl $12, %esp
   2776 ; AVX512F-32-NEXT:  .Ltmp8:
   2777 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 16
   2778 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
   2779 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
   2780 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k0
   2781 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
   2782 ; AVX512F-32-NEXT:    movl (%esp), %eax
   2783 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
   2784 ; AVX512F-32-NEXT:    addl $12, %esp
   2785 ; AVX512F-32-NEXT:    retl
          ; Single call; its i64 result is returned directly.
   2786   %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
   2787   ret i64 %res
   2788 }
   2789