; Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
      3 
      4 declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone
        ; NOTE(review): the CHECK lines in this file are autogenerated (see header);
        ; regenerate with utils/update_llc_test_checks.py instead of hand-editing them.
      5 define i32 @test_kortestz(i16 %a0, i16 %a1) {
        ; kortestz lowering: kortestw ORs the two masks; sete captures ZF
        ; (set when the OR is zero) and the result is masked to bit 0.
      6 ; CHECK-LABEL: test_kortestz:
      7 ; CHECK:       ## BB#0:
      8 ; CHECK-NEXT:    kmovw %esi, %k0
      9 ; CHECK-NEXT:    kmovw %edi, %k1
     10 ; CHECK-NEXT:    kortestw %k0, %k1
     11 ; CHECK-NEXT:    sete %al
     12 ; CHECK-NEXT:    kmovw %eax, %k0
     13 ; CHECK-NEXT:    kmovw %k0, %eax
     14 ; CHECK-NEXT:    andl $1, %eax
     15 ; CHECK-NEXT:    retq
     16   %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1)
     17   ret i32 %res
     18 }
     19 
     20 declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone
     21 define i32 @test_kortestc(i16 %a0, i16 %a1) {
        ; kortestc lowering: same kortestw, but the carry flag is materialized
        ; via sbbl (all-ones when CF set) and masked to bit 0.
     22 ; CHECK-LABEL: test_kortestc:
     23 ; CHECK:       ## BB#0:
     24 ; CHECK-NEXT:    kmovw %esi, %k0
     25 ; CHECK-NEXT:    kmovw %edi, %k1
     26 ; CHECK-NEXT:    kortestw %k0, %k1
     27 ; CHECK-NEXT:    sbbl %eax, %eax
     28 ; CHECK-NEXT:    andl $1, %eax
     29 ; CHECK-NEXT:    retq
     30   %res = call i32 @llvm.x86.avx512.kortestc.w(i16 %a0, i16 %a1)
     31   ret i32 %res
     32 }
     33 
     34 declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone
     35 define i16 @test_kand(i16 %a0, i16 %a1) {
        ; kand lowering: the constant 8 is materialized through a GPR into a
        ; k-register, then two kandw ops compute (%a0 & 8) & %a1.
     36 ; CHECK-LABEL: test_kand:
     37 ; CHECK:       ## BB#0:
     38 ; CHECK-NEXT:    movw $8, %ax
     39 ; CHECK-NEXT:    kmovw %eax, %k0
     40 ; CHECK-NEXT:    kmovw %edi, %k1
     41 ; CHECK-NEXT:    kandw %k0, %k1, %k0
     42 ; CHECK-NEXT:    kmovw %esi, %k1
     43 ; CHECK-NEXT:    kandw %k1, %k0, %k0
     44 ; CHECK-NEXT:    kmovw %k0, %eax
     45 ; CHECK-NEXT:    retq
     46   %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8)
     47   %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1)
     48   ret i16 %t2
     49 }
     50 
     51 declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone
     52 define i16 @test_knot(i16 %a0) {
        ; knot lowering: single knotw on the 16-bit mask, round-tripped
        ; through a k-register.
     53 ; CHECK-LABEL: test_knot:
     54 ; CHECK:       ## BB#0:
     55 ; CHECK-NEXT:    kmovw %edi, %k0
     56 ; CHECK-NEXT:    knotw %k0, %k0
     57 ; CHECK-NEXT:    kmovw %k0, %eax
     58 ; CHECK-NEXT:    retq
     59   %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0)
     60   ret i16 %res
     61 }
     62 
     63 declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
     64 
     65 define i16 @unpckbw_test(i16 %a0, i16 %a1) {
        ; kunpck.bw lowering: both masks are moved into k-registers and
        ; combined with a single kunpckbw.
     66 ; CHECK-LABEL: unpckbw_test:
     67 ; CHECK:       ## BB#0:
     68 ; CHECK-NEXT:    kmovw %edi, %k0
     69 ; CHECK-NEXT:    kmovw %esi, %k1
     70 ; CHECK-NEXT:    kunpckbw %k1, %k0, %k0
     71 ; CHECK-NEXT:    kmovw %k0, %eax
     72 ; CHECK-NEXT:    retq
     73   %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
     74   ret i16 %res
     75 }
     76 
     77 define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
        ; rcp14 with an all-ones mask (i16 -1) and zero passthru folds to the
        ; unmasked vrcp14ps form.
     78 ; CHECK-LABEL: test_rcp_ps_512:
     79 ; CHECK:       ## BB#0:
     80 ; CHECK-NEXT:    vrcp14ps %zmm0, %zmm0
     81 ; CHECK-NEXT:    retq
     82   %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
     83   ret <16 x float> %res
     84 }
     85 declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
     86 
     87 define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
        ; Double-precision variant of the case above (i8 -1 mask -> unmasked).
     88 ; CHECK-LABEL: test_rcp_pd_512:
     89 ; CHECK:       ## BB#0:
     90 ; CHECK-NEXT:    vrcp14pd %zmm0, %zmm0
     91 ; CHECK-NEXT:    retq
     92   %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1]
     93   ret <8 x double> %res
     94 }
     95 declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
     96 
     97 declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
     98 
     99 define <8 x double> @test7(<8 x double> %a) {
        ; rndscale imm=11, all-ones mask, rounding arg 4: no rounding operand
        ; is printed (4 selects the current/default direction in this file).
    100 ; CHECK-LABEL: test7:
    101 ; CHECK:       ## BB#0:
    102 ; CHECK-NEXT:    vrndscalepd $11, %zmm0, %zmm0
    103 ; CHECK-NEXT:    retq
    104   %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4)
    105   ret <8 x double>%res
    106 }
    107 
    108 declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
    109 
    110 define <16 x float> @test8(<16 x float> %a) {
        ; Single-precision counterpart of test7.
    111 ; CHECK-LABEL: test8:
    112 ; CHECK:       ## BB#0:
    113 ; CHECK-NEXT:    vrndscaleps $11, %zmm0, %zmm0
    114 ; CHECK-NEXT:    retq
    115   %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4)
    116   ret <16 x float>%res
    117 }
    118 
    119 define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
        ; rsqrt14 packed, all-ones mask -> unmasked vrsqrt14ps.
    120 ; CHECK-LABEL: test_rsqrt_ps_512:
    121 ; CHECK:       ## BB#0:
    122 ; CHECK-NEXT:    vrsqrt14ps %zmm0, %zmm0
    123 ; CHECK-NEXT:    retq
    124   %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
    125   ret <16 x float> %res
    126 }
    127 declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
    128 
    129 define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
        ; Scalar rsqrt14: same register feeds both source operands.
    130 ; CHECK-LABEL: test_rsqrt14_ss:
    131 ; CHECK:       ## BB#0:
    132 ; CHECK-NEXT:    vrsqrt14ss %xmm0, %xmm0, %xmm0
    133 ; CHECK-NEXT:    retq
    134   %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
    135   ret <4 x float> %res
    136 }
    137 declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
    138 
    139 define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
        ; Scalar rcp14, analogous to the rsqrt14 scalar case above.
    140 ; CHECK-LABEL: test_rcp14_ss:
    141 ; CHECK:       ## BB#0:
    142 ; CHECK-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm0
    143 ; CHECK-NEXT:    retq
    144   %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
    145   ret <4 x float> %res
    146 }
    147 declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
    148 
    149 define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
        ; mask.sqrt with all-ones mask and rounding arg 4 -> plain vsqrtpd.
    150 ; CHECK-LABEL: test_sqrt_pd_512:
    151 ; CHECK:       ## BB#0:
    152 ; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
    153 ; CHECK-NEXT:    retq
    154   %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0,  <8 x double> zeroinitializer, i8 -1, i32 4)
    155   ret <8 x double> %res
    156 }
    157 declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
    158 
    159 define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
        ; Single-precision counterpart, rounding arg 4 (no rounding operand).
    160 ; CHECK-LABEL: test_sqrt_ps_512:
    161 ; CHECK:       ## BB#0:
    162 ; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
    163 ; CHECK-NEXT:    retq
    164   %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
    165   ret <16 x float> %res
    166 }
    167 define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) {
        ; Rounding arg 3 prints as the {rz-sae} (round-toward-zero) operand.
    168 ; CHECK-LABEL: test_sqrt_round_ps_512:
    169 ; CHECK:       ## BB#0:
    170 ; CHECK-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm0
    171 ; CHECK-NEXT:    retq
    172   %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 3)
    173   ret <16 x float> %res
    174 }
    175 declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
    176 
    177 define <8 x double> @test_getexp_pd_512(<8 x double> %a0) {
        ; getexp, all-ones mask, rounding arg 4 -> unmasked vgetexppd.
    178 ; CHECK-LABEL: test_getexp_pd_512:
    179 ; CHECK:       ## BB#0:
    180 ; CHECK-NEXT:    vgetexppd %zmm0, %zmm0
    181 ; CHECK-NEXT:    retq
    182   %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0,  <8 x double> zeroinitializer, i8 -1, i32 4)
    183   ret <8 x double> %res
    184 }
    185 define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) {
        ; Rounding arg 8 prints as the {sae} (suppress-all-exceptions) operand.
    186 ; CHECK-LABEL: test_getexp_round_pd_512:
    187 ; CHECK:       ## BB#0:
    188 ; CHECK-NEXT:    vgetexppd {sae}, %zmm0, %zmm0
    189 ; CHECK-NEXT:    retq
    190   %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0,  <8 x double> zeroinitializer, i8 -1, i32 8)
    191   ret <8 x double> %res
    192 }
    193 declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
    194 
    195 define <16 x float> @test_getexp_ps_512(<16 x float> %a0) {
        ; Single-precision getexp, no rounding operand for arg 4.
    196 ; CHECK-LABEL: test_getexp_ps_512:
    197 ; CHECK:       ## BB#0:
    198 ; CHECK-NEXT:    vgetexpps %zmm0, %zmm0
    199 ; CHECK-NEXT:    retq
    200   %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
    201   ret <16 x float> %res
    202 }
    203 
    204 define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) {
        ; Single-precision getexp with {sae} (rounding arg 8).
    205 ; CHECK-LABEL: test_getexp_round_ps_512:
    206 ; CHECK:       ## BB#0:
    207 ; CHECK-NEXT:    vgetexpps {sae}, %zmm0, %zmm0
    208 ; CHECK-NEXT:    retq
    209   %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
    210   ret <16 x float> %res
    211 }
    212 declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
    213 
    214 declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
    215 
    216 define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
        ; Exercises all four scalar-sqrt forms in one test: masked merge (rc=4),
        ; masked merge with {rd-sae} (rc=1), zero-masked {ru-sae} (rc=2), and
        ; unmasked {rz-sae} (rc=3); the four results are summed together.
    217 ; CHECK-LABEL: test_sqrt_ss:
    218 ; CHECK:       ## BB#0:
    219 ; CHECK-NEXT:    andl $1, %edi
    220 ; CHECK-NEXT:    kmovw %edi, %k1
    221 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
    222 ; CHECK-NEXT:    vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
    223 ; CHECK-NEXT:    vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
    224 ; CHECK-NEXT:    vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
    225 ; CHECK-NEXT:    vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
    226 ; CHECK-NEXT:    vaddps %xmm2, %xmm3, %xmm1
    227 ; CHECK-NEXT:    vaddps %xmm0, %xmm4, %xmm0
    228 ; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
    229 ; CHECK-NEXT:    retq
    230   %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
    231   %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
    232   %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 2)
    233   %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 3)
    234 
    235   %res.1 = fadd <4 x float> %res0, %res1
    236   %res.2 = fadd <4 x float> %res2, %res3
    237   %res   = fadd <4 x float> %res.1, %res.2
    238   ret <4 x float> %res
    239 }
    240 
    241 declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
    242 
    243 define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
        ; Double-precision counterpart of test_sqrt_ss: merge-masked (rc=4),
        ; {rd-sae} merge (rc=1), {ru-sae} zero-masked (rc=2), {rz-sae}
        ; unmasked (rc=3); results summed.
    244 ; CHECK-LABEL: test_sqrt_sd:
    245 ; CHECK:       ## BB#0:
    246 ; CHECK-NEXT:    andl $1, %edi
    247 ; CHECK-NEXT:    kmovw %edi, %k1
    248 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
    249 ; CHECK-NEXT:    vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
    250 ; CHECK-NEXT:    vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
    251 ; CHECK-NEXT:    vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
    252 ; CHECK-NEXT:    vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
    253 ; CHECK-NEXT:    vaddpd %xmm2, %xmm3, %xmm1
    254 ; CHECK-NEXT:    vaddpd %xmm0, %xmm4, %xmm0
    255 ; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
    256 ; CHECK-NEXT:    retq
    257   %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
    258   %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
    259   %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 2)
    260   %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 3)
    261 
    262   %res.1 = fadd <2 x double> %res0, %res1
    263   %res.2 = fadd <2 x double> %res2, %res3
    264   %res   = fadd <2 x double> %res.1, %res.2
    265   ret <2 x double> %res
    266 }
    267 
    268 define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
        ; SSE2 scalar double -> i64 conversion picks the AVX-encoded vcvtsd2si.
    269 ; CHECK-LABEL: test_x86_sse2_cvtsd2si64:
    270 ; CHECK:       ## BB#0:
    271 ; CHECK-NEXT:    vcvtsd2si %xmm0, %rax
    272 ; CHECK-NEXT:    retq
    273   %res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
    274   ret i64 %res
    275 }
    276 declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
    277 
    278 define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) {
        ; i64 -> scalar double insert; the q suffix reflects the 64-bit source.
    279 ; CHECK-LABEL: test_x86_sse2_cvtsi642sd:
    280 ; CHECK:       ## BB#0:
    281 ; CHECK-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0
    282 ; CHECK-NEXT:    retq
    283   %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
    284   ret <2 x double> %res
    285 }
    286 declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
    287 
    288 define i64 @test_x86_avx512_cvttsd2si64(<2 x double> %a0) {
        ; Truncating double -> i64: rc=4 has no rounding operand, rc=8 prints
        ; {sae}; the two results are added so both calls survive.
    289 ; CHECK-LABEL: test_x86_avx512_cvttsd2si64:
    290 ; CHECK:       ## BB#0:
    291 ; CHECK-NEXT:    vcvttsd2si %xmm0, %rcx
    292 ; CHECK-NEXT:    vcvttsd2si {sae}, %xmm0, %rax
    293 ; CHECK-NEXT:    addq %rcx, %rax
    294 ; CHECK-NEXT:    retq
    295   %res0 = call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %a0, i32 4) ;
    296   %res1 = call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %a0, i32 8) ;
    297   %res2 = add i64 %res0, %res1
    298   ret i64 %res2
    299 }
    300 declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32) nounwind readnone
    301 
    302 define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) {
        ; Truncating double -> u32, plain and {sae} forms.
    303 ; CHECK-LABEL: test_x86_avx512_cvttsd2usi:
    304 ; CHECK:       ## BB#0:
    305 ; CHECK-NEXT:    vcvttsd2usi %xmm0, %ecx
    306 ; CHECK-NEXT:    vcvttsd2usi {sae}, %xmm0, %eax
    307 ; CHECK-NEXT:    addl %ecx, %eax
    308 ; CHECK-NEXT:    retq
    309   %res0 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ;
    310   %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ;
    311   %res2 = add i32 %res0, %res1
    312   ret i32 %res2
    313 }
    314 declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone
    315 
    316 define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) {
        ; Truncating double -> i32, plain and {sae} forms.
    317 ; CHECK-LABEL: test_x86_avx512_cvttsd2si:
    318 ; CHECK:       ## BB#0:
    319 ; CHECK-NEXT:    vcvttsd2si %xmm0, %ecx
    320 ; CHECK-NEXT:    vcvttsd2si {sae}, %xmm0, %eax
    321 ; CHECK-NEXT:    addl %ecx, %eax
    322 ; CHECK-NEXT:    retq
    323   %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ;
    324   %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ;
    325   %res2 = add i32 %res0, %res1
    326   ret i32 %res2
    327 }
    328 declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone
    329 
    330 
    331 
    332 define i64 @test_x86_avx512_cvttsd2usi64(<2 x double> %a0) {
        ; Truncating double -> u64, plain and {sae} forms.
    333 ; CHECK-LABEL: test_x86_avx512_cvttsd2usi64:
    334 ; CHECK:       ## BB#0:
    335 ; CHECK-NEXT:    vcvttsd2usi %xmm0, %rcx
    336 ; CHECK-NEXT:    vcvttsd2usi {sae}, %xmm0, %rax
    337 ; CHECK-NEXT:    addq %rcx, %rax
    338 ; CHECK-NEXT:    retq
    339   %res0 = call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %a0, i32 4) ;
    340   %res1 = call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %a0, i32 8) ;
    341   %res2 = add i64 %res0, %res1
    342   ret i64 %res2
    343 }
    344 declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32) nounwind readnone
    345 
    346 define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
        ; SSE scalar float -> i64 conversion via the AVX-encoded vcvtss2si.
    347 ; CHECK-LABEL: test_x86_sse_cvtss2si64:
    348 ; CHECK:       ## BB#0:
    349 ; CHECK-NEXT:    vcvtss2si %xmm0, %rax
    350 ; CHECK-NEXT:    retq
    351   %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
    352   ret i64 %res
    353 }
    354 declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
    355 
    356 
    357 define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
        ; i64 -> scalar float insert; q suffix for the 64-bit GPR source.
    358 ; CHECK-LABEL: test_x86_sse_cvtsi642ss:
    359 ; CHECK:       ## BB#0:
    360 ; CHECK-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0
    361 ; CHECK-NEXT:    retq
    362   %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
    363   ret <4 x float> %res
    364 }
    365 declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
    366 
    367 
    368 define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) {
        ; Truncating float -> i32; note the {sae} call comes first here, so the
        ; emitted order is the reverse of the other cvtt tests.
    369 ; CHECK-LABEL: test_x86_avx512_cvttss2si:
    370 ; CHECK:       ## BB#0:
    371 ; CHECK-NEXT:    vcvttss2si {sae}, %xmm0, %ecx
    372 ; CHECK-NEXT:    vcvttss2si %xmm0, %eax
    373 ; CHECK-NEXT:    addl %ecx, %eax
    374 ; CHECK-NEXT:    retq
    375   %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ;
    376   %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ;
    377   %res2 = add i32 %res0, %res1
    378   ret i32 %res2
    379 }
    380 declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone
    381 
    382 define i64 @test_x86_avx512_cvttss2si64(<4 x float> %a0) {
        ; Truncating float -> i64, plain (rc=4) and {sae} (rc=8) forms.
    383 ; CHECK-LABEL: test_x86_avx512_cvttss2si64:
    384 ; CHECK:       ## BB#0:
    385 ; CHECK-NEXT:    vcvttss2si %xmm0, %rcx
    386 ; CHECK-NEXT:    vcvttss2si {sae}, %xmm0, %rax
    387 ; CHECK-NEXT:    addq %rcx, %rax
    388 ; CHECK-NEXT:    retq
    389   %res0 = call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %a0, i32 4) ;
    390   %res1 = call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %a0, i32 8) ;
    391   %res2 = add i64 %res0, %res1
    392   ret i64 %res2
    393 }
    394 declare i64 @llvm.x86.avx512.cvttss2si64(<4 x float>, i32) nounwind readnone
    395 
    396 define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) {
        ; Truncating float -> u32; {sae} call first, like cvttss2si above.
    397 ; CHECK-LABEL: test_x86_avx512_cvttss2usi:
    398 ; CHECK:       ## BB#0:
    399 ; CHECK-NEXT:    vcvttss2usi {sae}, %xmm0, %ecx
    400 ; CHECK-NEXT:    vcvttss2usi %xmm0, %eax
    401 ; CHECK-NEXT:    addl %ecx, %eax
    402 ; CHECK-NEXT:    retq
    403   %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ;
    404   %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ;
    405   %res2 = add i32 %res0, %res1
    406   ret i32 %res2
    407 }
    408 declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone
    409 
    410 define i64 @test_x86_avx512_cvttss2usi64(<4 x float> %a0) {
        ; Truncating float -> u64, plain and {sae} forms.
    411 ; CHECK-LABEL: test_x86_avx512_cvttss2usi64:
    412 ; CHECK:       ## BB#0:
    413 ; CHECK-NEXT:    vcvttss2usi %xmm0, %rcx
    414 ; CHECK-NEXT:    vcvttss2usi {sae}, %xmm0, %rax
    415 ; CHECK-NEXT:    addq %rcx, %rax
    416 ; CHECK-NEXT:    retq
    417   %res0 = call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %a0, i32 4) ;
    418   %res1 = call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %a0, i32 8) ;
    419   %res2 = add i64 %res0, %res1
    420   ret i64 %res2
    421 }
    422 declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone
    423 
    424 define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
        ; Non-truncating double -> u64 conversion (no rounding argument).
    425 ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
    426 ; CHECK:       ## BB#0:
    427 ; CHECK-NEXT:    vcvtsd2usi %xmm0, %rax
    428 ; CHECK-NEXT:    retq
    429   %res = call i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double> %a0) ; <i64> [#uses=1]
    430   ret i64 %res
    431 }
    432 declare i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double>) nounwind readnone
    433 
    434 define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
        ; half -> float widening; all-ones mask folds to unmasked vcvtph2ps.
    435 ; CHECK-LABEL: test_x86_vcvtph2ps_512:
    436 ; CHECK:       ## BB#0:
    437 ; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm0
    438 ; CHECK-NEXT:    retq
    439   %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
    440   ret <16 x float> %res
    441 }
    442 
    443 define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) {
        ; Same conversion with rc=8, which prints the {sae} operand.
    444 ; CHECK-LABEL: test_x86_vcvtph2ps_512_sae:
    445 ; CHECK:       ## BB#0:
    446 ; CHECK-NEXT:    vcvtph2ps {sae}, %ymm0, %zmm0
    447 ; CHECK-NEXT:    retq
    448   %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
    449   ret <16 x float> %res
    450 }
    451 
    452 define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) {
        ; Merge-masked form: %a1 is the passthru, so the result is produced in
        ; zmm1 under {%k1} and copied back to zmm0.
    453 ; CHECK-LABEL: test_x86_vcvtph2ps_512_rrk:
    454 ; CHECK:       ## BB#0:
    455 ; CHECK-NEXT:    kmovw %edi, %k1
    456 ; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm1 {%k1}
    457 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
    458 ; CHECK-NEXT:    retq
    459   %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4)
    460   ret <16 x float> %res
    461 }
    462 
    463 define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) {
        ; Zero-masked ({z}) form combined with {sae}.
    464 ; CHECK-LABEL: test_x86_vcvtph2ps_512_sae_rrkz:
    465 ; CHECK:       ## BB#0:
    466 ; CHECK-NEXT:    kmovw %edi, %k1
    467 ; CHECK-NEXT:    vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z}
    468 ; CHECK-NEXT:    retq
    469   %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8)
    470   ret <16 x float> %res
    471 }
    472 
    473 define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) {
        ; Zero-masked form without {sae} (rc=4).
    474 ; CHECK-LABEL: test_x86_vcvtph2ps_512_rrkz:
    475 ; CHECK:       ## BB#0:
    476 ; CHECK-NEXT:    kmovw %edi, %k1
    477 ; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm0 {%k1} {z}
    478 ; CHECK-NEXT:    retq
    479   %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
    480   ret <16 x float> %res
    481 }
    482 
    483 declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
    484 
    485 
    486 define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0) {
        ; float -> half narrowing with rounding-imm 2; unmasked.
    487 ; CHECK-LABEL: test_x86_vcvtps2ph_256:
    488 ; CHECK:       ## BB#0:
    489 ; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm0
    490 ; CHECK-NEXT:    retq
    491   %res = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
    492   ret <16 x i16> %res
    493 }
    494 
    495 declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly
    496 
    497 define <16 x float> @test_x86_vbroadcast_ss_512(i8* %a0) {
        ; Broadcast a float from memory to all 16 lanes of a zmm.
    498 ; CHECK-LABEL: test_x86_vbroadcast_ss_512:
    499 ; CHECK:       ## BB#0:
    500 ; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0
    501 ; CHECK-NEXT:    retq
    502   %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8* %a0) ; <<16 x float>> [#uses=1]
    503   ret <16 x float> %res
    504 }
    505 declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8*) nounwind readonly
    506 
    507 define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) {
        ; Broadcast a double from memory to all 8 lanes of a zmm.
    508 ; CHECK-LABEL: test_x86_vbroadcast_sd_512:
    509 ; CHECK:       ## BB#0:
    510 ; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0
    511 ; CHECK-NEXT:    retq
    512   %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8* %a0) ; <<8 x double>> [#uses=1]
    513   ret <8 x double> %res
    514 }
    515 declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly
    516 
    517 define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0) {
        ; Register-source variant: broadcast xmm lane 0 to a full zmm.
    518 ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
    519 ; CHECK:       ## BB#0:
    520 ; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
    521 ; CHECK-NEXT:    retq
    522   %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.ps.512(<4 x float> %a0) ; <<16 x float>> [#uses=1]
    523   ret <16 x float> %res
    524 }
    525 declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.ps.512(<4 x float>) nounwind readonly
    526 
    527 define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0) {
        ; Register-source double broadcast.
    528 ; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
    529 ; CHECK:       ## BB#0:
    530 ; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
    531 ; CHECK-NEXT:    retq
    532   %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.pd.512(<2 x double> %a0) ; <<8 x double>> [#uses=1]
    533   ret <8 x double> %res
    534 }
    535 declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.pd.512(<2 x double>) nounwind readonly
    536 
    537 define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) {
        ; pbroadcastd tested in all three forms: unmasked, merge-masked
        ; ({%k1} into %x1), and zero-masked ({%k1} {z}); sums keep all live.
    538 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512:
    539 ; CHECK:       ## BB#0:
    540 ; CHECK-NEXT:    kmovw %edi, %k1
    541 ; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm1 {%k1}
    542 ; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm2 {%k1} {z}
    543 ; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm0
    544 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    545 ; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
    546 ; CHECK-NEXT:    retq
    547   %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
    548   %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
    549   %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
    550   %res3 = add <16 x i32> %res, %res1
    551   %res4 = add <16 x i32> %res2, %res3
    552   ret <16 x i32> %res4
    553 }
    554 declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)
    555 
    556 define <16 x i32> @test_x86_pbroadcastd_i32_512(i32  %a0) {
        ; GPR-source form: broadcast a 32-bit GPR straight into a zmm.
    557 ; CHECK-LABEL: test_x86_pbroadcastd_i32_512:
    558 ; CHECK:       ## BB#0:
    559 ; CHECK-NEXT:    vpbroadcastd %edi, %zmm0
    560 ; CHECK-NEXT:    retq
    561   %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32 %a0) ; <<16 x i32>> [#uses=1]
    562   ret <16 x i32> %res
    563 }
    564 declare <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32) nounwind readonly
    565 
    566 define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
        ; 64-bit variant; the i8 mask is first zero-extended (movzbl) before
        ; loading it into %k1.
    567 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
    568 ; CHECK:       ## BB#0:
    569 ; CHECK-NEXT:    movzbl %dil, %eax
    570 ; CHECK-NEXT:    kmovw %eax, %k1
    571 ; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm1 {%k1}
    572 ; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm2 {%k1} {z}
    573 ; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm0
    574 ; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
    575 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
    576 ; CHECK-NEXT:    retq
    577   %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
    578   %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask)
    579   %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask)
    580   %res3 = add <8 x i64> %res, %res1
    581   %res4 = add <8 x i64> %res2, %res3
    582   ret <8 x i64> %res4
    583 }
    584 declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)
    585 
    586 define <8 x i64> @test_x86_pbroadcastq_i64_512(i64 %a0) {
        ; GPR-source form: broadcast a 64-bit GPR into a zmm.
    587 ; CHECK-LABEL: test_x86_pbroadcastq_i64_512:
    588 ; CHECK:       ## BB#0:
    589 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0
    590 ; CHECK-NEXT:    retq
    591   %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64 %a0) ; <<8 x i64>> [#uses=1]
    592   ret <8 x i64> %res
    593 }
    594 declare <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64) nounwind readonly
    595 
    596 define <16 x i32> @test_conflict_d(<16 x i32> %a) {
        ; CDI conflict-detect, dword; all-ones mask -> unmasked vpconflictd.
    597 ; CHECK-LABEL: test_conflict_d:
    598 ; CHECK:       ## BB#0:
    599 ; CHECK-NEXT:    vpconflictd %zmm0, %zmm0
    600 ; CHECK-NEXT:    retq
    601   %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
    602   ret <16 x i32> %res
    603 }
    604 
    605 declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
    606 
    607 define <8 x i64> @test_conflict_q(<8 x i64> %a) {
        ; qword variant, unmasked.
    608 ; CHECK-LABEL: test_conflict_q:
    609 ; CHECK:       ## BB#0:
    610 ; CHECK-NEXT:    vpconflictq %zmm0, %zmm0
    611 ; CHECK-NEXT:    retq
    612   %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
    613   ret <8 x i64> %res
    614 }
    615 
    616 declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
    617 
    618 define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
        ; Zero-masked dword form ({%k1} {z}).
    619 ; CHECK-LABEL: test_maskz_conflict_d:
    620 ; CHECK:       ## BB#0:
    621 ; CHECK-NEXT:    kmovw %edi, %k1
    622 ; CHECK-NEXT:    vpconflictd %zmm0, %zmm0 {%k1} {z}
    623 ; CHECK-NEXT:    retq
    624   %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 %mask)
    625   ret <16 x i32> %res
    626 }
    627 
    628 define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
        ; Merge-masked qword form; the i8 mask is zero-extended via movzbl and
        ; the result merges into %b (zmm1) before being moved to zmm0.
    629 ; CHECK-LABEL: test_mask_conflict_q:
    630 ; CHECK:       ## BB#0:
    631 ; CHECK-NEXT:    movzbl %dil, %eax
    632 ; CHECK-NEXT:    kmovw %eax, %k1
    633 ; CHECK-NEXT:    vpconflictq %zmm0, %zmm1 {%k1}
    634 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
    635 ; CHECK-NEXT:    retq
    636   %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
    637   ret <8 x i64> %res
    638 }
    639 
    640 define <16 x i32> @test_lzcnt_d(<16 x i32> %a) {
    641 ; CHECK-LABEL: test_lzcnt_d:
    642 ; CHECK:       ## BB#0:
    643 ; CHECK-NEXT:    vplzcntd %zmm0, %zmm0
    644 ; CHECK-NEXT:    retq
    645   %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
    646   ret <16 x i32> %res
    647 }
    648 
    649 declare <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
    650 
    651 define <8 x i64> @test_lzcnt_q(<8 x i64> %a) {
    652 ; CHECK-LABEL: test_lzcnt_q:
    653 ; CHECK:       ## BB#0:
    654 ; CHECK-NEXT:    vplzcntq %zmm0, %zmm0
    655 ; CHECK-NEXT:    retq
    656   %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
    657   ret <8 x i64> %res
    658 }
    659 
    660 declare <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
    661 
    662 
; Merge-masking VPLZCNTD with %b as passthru.
define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_lzcnt_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vplzcntd %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
  ret <16 x i32> %res
}
    673 
; Merge-masking VPLZCNTQ with %b as passthru; the i8 mask arrives in %dil
; and is zero-extended before the kmovw.
define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_lzcnt_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vplzcntq %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret <8 x i64> %res
}
    685 
; Masked blend of two <16 x float> vectors should lower to a single vblendmps.
define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_mask_blend_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vblendmps %zmm1, %zmm0, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float> %a1, <16 x float> %a2, i16 %a0) ; <<16 x float>> [#uses=1]
  ret <16 x float> %res
}
    695 
    696 declare <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float>, <16 x float>, i16) nounwind readonly
    697 
; Masked blend of two <8 x double> vectors should lower to a single vblendmpd.
define <8 x double> @test_x86_mask_blend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_mask_blend_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vblendmpd %zmm1, %zmm0, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a1, <8 x double> %a2, i8 %a0) ; <<8 x double>> [#uses=1]
  ret <8 x double> %res
}
    708 
; Same as above but with one operand loaded from memory: the load should be
; folded into the vblendmpd as a (%rdi) memory operand.
define <8 x double> @test_x86_mask_blend_pd_512_memop(<8 x double> %a, <8 x double>* %ptr, i8 %mask) {
; CHECK-LABEL: test_x86_mask_blend_pd_512_memop:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vblendmpd (%rdi), %zmm0, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %b = load <8 x double>, <8 x double>* %ptr
  %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a, <8 x double> %b, i8 %mask) ; <<8 x double>> [#uses=1]
  ret <8 x double> %res
}
    720 declare <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double>, <8 x double>, i8) nounwind readonly
    721 
; Integer (dword) masked blend lowers to vpblendmd.
define <16 x i32> @test_x86_mask_blend_d_512(i16 %a0, <16 x i32> %a1, <16 x i32> %a2) {
; CHECK-LABEL: test_x86_mask_blend_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32> %a1, <16 x i32> %a2, i16 %a0) ; <<16 x i32>> [#uses=1]
  ret <16 x i32> %res
}
    731 declare <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
    732 
; Integer (qword) masked blend lowers to vpblendmq.
define <8 x i64> @test_x86_mask_blend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; CHECK-LABEL: test_x86_mask_blend_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64> %a1, <8 x i64> %a2, i8 %a0) ; <<8 x i64>> [#uses=1]
  ret <8 x i64> %res
}
    743 declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
    744 
; FP compare (cc=2, LE) with rounding arg 8: should emit the {sae} form.
 define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
; CHECK-LABEL: test_cmpps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
   ret i16 %res
 }
    754  declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)
    755 
; FP compare (cc=4, NEQ) with default rounding arg 4: no {sae} expected.
 define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
; CHECK-LABEL: test_cmppd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpneqpd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
   ret i8 %res
 }
    765  declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
    766 
 ; FP min/max intrinsic tests (vmaxpd / vminpd)
; Unmasked (i8 -1) vmaxpd with default rounding: plain vmaxpd expected.
define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_vmaxpd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double>zeroinitializer, i8 -1, i32 4)
  ret <8 x double> %res
}
    777 declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>,
    778                     <8 x double>, i8, i32)
    779 
; Unmasked (i8 -1) vminpd with default rounding: plain vminpd expected.
define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_vminpd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vminpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double>zeroinitializer, i8 -1, i32 4)
  ret <8 x double> %res
}
    789 declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>,
    790                     <8 x double>, i8, i32)
    791 
    792  declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16)
    793 
; VPABSD, both masked (merge into %x1) and unmasked (i16 -1); the two results
; are added so both lowerings appear in the same function's output.
define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpabsd %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpabsd %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
    807 
    808 declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
    809 
; VPABSQ, both masked (merge into %x1) and unmasked (i8 -1); results summed
; so both lowerings appear in the same function's output.
define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpabsq %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpabsq %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
    824 
; Unmasked VPTESTMQ producing a mask register read back into a GPR.
define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_vptestmq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vptestmq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
  ret i8 %res
}
    834 declare i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
    835 
; Unmasked VPTESTMD producing a mask register read back into a GPR.
define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_vptestmd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vptestmd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
  ret i16 %res
}
    845 declare i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
    846 
; Masked unaligned float store intrinsic lowers to vmovups with a {%k1} mask.
define void @test_store1(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_store1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovups %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
  ret void
}
    856 
    857 declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
    858 
; Masked unaligned double store intrinsic lowers to vmovupd with a {%k1} mask.
define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_store2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovupd %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
  ret void
}
    868 
    869 declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
    870 
; Masked aligned float store intrinsic lowers to vmovaps with a {%k1} mask.
define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_store_aligned_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
  ret void
}
    880 
    881 declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )
    882 
; Masked aligned double store intrinsic lowers to vmovapd with a {%k1} mask.
define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_store_aligned_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
  ret void
}
    892 
    893 declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
    894 
; Zero-masked aligned float load: zero passthru selects the {z} form.
define <16 x float> @test_maskz_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_maskz_load_aligned_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
  ret <16 x float> %res
}
    904 
    905 declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
    906 
; Zero-masked aligned double load: zero passthru selects the {z} form.
define <8 x double> @test_maskz_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_maskz_load_aligned_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
  ret <8 x double> %res
}
    916 
    917 declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
    918 
; All-ones mask: the masked load folds to an ordinary unmasked vmovaps.
define <16 x float> @test_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_load_aligned_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovaps (%rdi), %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
  ret <16 x float> %res
}
    927 
; All-ones mask: the masked load folds to an ordinary unmasked vmovapd.
define <8 x double> @test_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_load_aligned_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovapd (%rdi), %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
  ret <8 x double> %res
}
    936 
    937 declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*)
    938 
; Unmasked VALIGNQ with immediate shift count 2.
define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_valign_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    valignq $2, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}
    947 
; Merge-masked VALIGNQ: %src is the passthru register ({%k1} without {z}).
define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
; CHECK-LABEL: test_mask_valign_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    valignq $2, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> %src, i8 %mask)
  ret <8 x i64> %res
}
    959 
    960 declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
    961 
; Zero-masked VALIGND (immediate 5): zero passthru selects the {z} form.
define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_maskz_valign_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    valignd $5, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i32 5, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}
    971 
    972 declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
    973 
; Masked scalar single-precision store lowers to vmovss with a {%k1} mask.
define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) {
; CHECK-LABEL: test_mask_store_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovss %xmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
 call void @llvm.x86.avx512.mask.store.ss(i8* %ptr, <4 x float> %data, i8 %mask)
 ret void
}
    983 
    984 declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 )
    985 
; Unmasked dword equality compare into a mask register.
define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
  ret i16 %res
}
    995 
; Masked dword equality compare: %mask becomes the {%k1} write-mask.
define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
  ret i16 %res
}
   1006 
   1007 declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
   1008 
; Unmasked qword equality compare into a mask register.
define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
  ret i8 %res
}
   1018 
; Masked qword equality compare: the i8 mask is zero-extended, then used as {%k1}.
define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret i8 %res
}
   1030 
   1031 declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
   1032 
; Unmasked signed dword greater-than compare into a mask register.
define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
  ret i16 %res
}
   1042 
; Masked signed dword greater-than compare under {%k1}.
define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
  ret i16 %res
}
   1053 
   1054 declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
   1055 
; Unmasked signed qword greater-than compare into a mask register.
define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
  ret i8 %res
}
   1065 
; Masked signed qword greater-than compare under {%k1}.
define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret i8 %res
}
   1077 
   1078 declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
   1079 
; Exercises all eight signed-compare condition codes (cc = 0..7) of the
; dword cmp intrinsic, unmasked; each result is inserted into one <8 x i16> lane.
define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_cmp_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunordd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %esi
; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnled %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmpordd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    vmovd %r8d, %xmm0
; CHECK-NEXT:    vpinsrw $1, %r9d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $2, %r10d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $3, %esi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $4, %edi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
   1126 
; Same eight signed-compare condition codes as test_cmp_d_512, but each
; compare is executed under the {%k1} write-mask loaded once from %mask.
define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_cmp_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunordd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %esi
; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnled %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmpordd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    vmovd %r8d, %xmm0
; CHECK-NEXT:    vpinsrw $1, %r9d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $2, %r10d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $3, %esi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $4, %edi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
   1174 
   1175 declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
   1176 
; Exercises all eight UNSIGNED-compare condition codes (cc = 0..7) of the
; dword ucmp intrinsic, unmasked (expects the vpcmp*ud mnemonics).
define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_ucmp_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpequd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunordud %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %esi
; CHECK-NEXT:    vpcmpnequd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmpordud %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    vmovd %r8d, %xmm0
; CHECK-NEXT:    vpinsrw $1, %r9d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $2, %r10d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $3, %esi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $4, %edi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
   1223 
; Same eight unsigned-compare condition codes as test_ucmp_d_512, but each
; compare runs under the {%k1} write-mask loaded once from %mask.
define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_ucmp_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpequd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunordud %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %esi
; CHECK-NEXT:    vpcmpnequd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmpordud %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    vmovd %r8d, %xmm0
; CHECK-NEXT:    vpinsrw $1, %r9d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $2, %r10d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $3, %esi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $4, %edi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
   1271 
   1272 declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
   1273 
; Exercises all eight signed-compare condition codes (cc = 0..7) of the
; qword cmp intrinsic, unmasked; each i8 result is zero-extended and
; inserted into an even byte lane of the <8 x i8> return value.
define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_cmp_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunordq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r11d
; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnleq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmpordq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    movzbl %r8b, %esi
; CHECK-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r9b, %esi
; CHECK-NEXT:    vpinsrb $2, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r10b, %esi
; CHECK-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r11b, %esi
; CHECK-NEXT:    vpinsrb $6, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dil, %esi
; CHECK-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %cl, %eax
; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dl, %eax
; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}
   1328 
; Same 8-predicate signed qword-compare sweep as test_cmp_q_512, but with a
; variable i8 %mask: expects the mask zero-extended (movzbl) into %k1 and
; every vpcmp*q issued under {%k1}.
; Assertions below were autogenerated by update_llc_test_checks.py.
define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunordq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r11d
; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnleq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmpordq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    movzbl %r8b, %esi
; CHECK-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r9b, %esi
; CHECK-NEXT:    vpinsrb $2, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r10b, %esi
; CHECK-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r11b, %esi
; CHECK-NEXT:    vpinsrb $6, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dil, %esi
; CHECK-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %cl, %eax
; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dl, %eax
; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}
   1385 
   1386 declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
   1387 
; Unsigned counterpart of test_cmp_q_512: sweeps predicates 0-7 of the
; masked unsigned qword compare (ucmp) with an all-ones mask; expects the
; vpcmp*uq instruction family.
; Assertions below were autogenerated by update_llc_test_checks.py.
define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_ucmp_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpequq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunorduq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r11d
; CHECK-NEXT:    vpcmpnequq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmporduq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    movzbl %r8b, %esi
; CHECK-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r9b, %esi
; CHECK-NEXT:    vpinsrb $2, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r10b, %esi
; CHECK-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r11b, %esi
; CHECK-NEXT:    vpinsrb $6, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dil, %esi
; CHECK-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %cl, %eax
; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dl, %eax
; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}
   1442 
; Masked unsigned qword-compare sweep: like test_ucmp_q_512 but under a
; variable i8 %mask, so each vpcmp*uq should carry the {%k1} write-mask.
; Assertions below were autogenerated by update_llc_test_checks.py.
define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpcmpequq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunorduq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r11d
; CHECK-NEXT:    vpcmpnequq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmporduq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    movzbl %r8b, %esi
; CHECK-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r9b, %esi
; CHECK-NEXT:    vpinsrb $2, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r10b, %esi
; CHECK-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r11b, %esi
; CHECK-NEXT:    vpinsrb $6, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dil, %esi
; CHECK-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %cl, %eax
; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dl, %eax
; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}
   1499 
   1500 declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
   1501 
; Merge-masked 128-bit float extract: lane 2 of %a is extracted into %b
; under %mask; expects vextractf32x4 with a {%k1} write-mask.
define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
; CHECK-LABEL: test_mask_vextractf32x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask)
  ret <4 x float> %res
}
   1511 
   1512 declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8)
   1513 
; Merge-masked 256-bit integer extract: upper half index 2 of %a merged into
; %b under %mask; expects vextracti64x4 with a {%k1} write-mask.
define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
; CHECK-LABEL: test_mask_vextracti64x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vextracti64x4 $2, %zmm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask)
  ret <4 x i64> %res
}
   1523 
   1524 declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8)
   1525 
; Zero-masked 128-bit integer extract (zeroinitializer pass-through):
; expects the {%k1} {z} encoding of vextracti32x4.
define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
; CHECK-LABEL: test_maskz_vextracti32x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}
   1535 
   1536 declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8)
   1537 
; Unmasked 256-bit double extract (mask = -1 selects all lanes), so no
; write-mask should appear on the emitted vextractf64x4.
define <4 x double> @test_vextractf64x4(<8 x double> %a) {
; CHECK-LABEL: test_vextractf64x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf64x4 $2, %zmm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1)
  ret <4 x double> %res
}
   1546 
   1547 declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8)
   1548 
; Unmasked dword shift-left by immediate 7 (i16 mask -1 = all lanes).
define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}
   1557 
; Merge-masked dword shift-left by immediate: result blended into %a1
; under %mask, so vpslld should target zmm1 with {%k1} and copy back.
define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}
   1568 
; Zero-masked dword shift-left by immediate (zeroinitializer pass-through):
; expects the {%k1} {z} encoding.
define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}
   1578 
   1579 declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
   1580 
; Unmasked qword shift-left by immediate 7 (i8 mask -1 = all lanes).
define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}
   1589 
; Merge-masked qword shift-left by immediate; the i8 mask is zero-extended
; (movzbl) before being moved into %k1.
define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}
   1601 
; Zero-masked qword shift-left by immediate: expects {%k1} {z}.
define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}
   1612 
   1613 declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
   1614 
; Unmasked dword logical shift-right by immediate 7.
define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}
   1623 
; Merge-masked dword logical shift-right by immediate into %a1 under %mask.
define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}
   1634 
; Zero-masked dword logical shift-right by immediate: expects {%k1} {z}.
define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}
   1644 
   1645 declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
   1646 
; Unmasked qword logical shift-right by immediate 7.
define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}
   1655 
; Merge-masked qword logical shift-right by immediate into %a1 under the
; zero-extended i8 %mask.
define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}
   1667 
; Zero-masked qword logical shift-right by immediate: expects {%k1} {z}.
define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}
   1678 
   1679 declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
   1680 
; Unmasked dword arithmetic shift-right by immediate 7.
define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}
   1689 
; Merge-masked dword arithmetic shift-right by immediate into %a1 under %mask.
define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}
   1700 
; Zero-masked dword arithmetic shift-right by immediate: expects {%k1} {z}.
define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}
   1710 
   1711 declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
   1712 
; Unmasked qword arithmetic shift-right by immediate 7 (vpsraq is new in
; AVX-512; there is no pre-AVX-512 qword arithmetic shift).
define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}
   1721 
; Merge-masked qword arithmetic shift-right by immediate into %a1 under the
; zero-extended i8 %mask.
define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}
   1733 
; Zero-masked qword arithmetic shift-right by immediate: expects {%k1} {z}.
define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}
   1744 
   1745 declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
   1746 
; Unmasked dword shift-left with the count taken from an xmm operand
; (uniform shift, not per-element).
define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}
   1755 
; Merge-masked dword shift-left by xmm count, blended into %a2 under %mask.
define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psll_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}
   1766 
; Zero-masked dword shift-left by xmm count: expects {%k1} {z}.
define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psll_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}
   1776 
   1777 declare <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
   1778 
; Unmasked qword shift-left with the count taken from an xmm operand.
define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}
   1787 
; Merge-masked qword shift-left by xmm count under the zero-extended i8 mask.
define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psll_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}
   1799 
; Zero-masked qword shift-left by xmm count: expects {%k1} {z}.
define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psll_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}
   1810 
   1811 declare <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
   1812 
; Unmasked dword logical shift-right with the count from an xmm operand.
define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}
   1821 
; Merge-masked dword logical shift-right by xmm count into %a2 under %mask.
define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}
   1832 
; Zero-masked dword logical shift-right by xmm count: expects {%k1} {z}.
define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}
   1842 
   1843 declare <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
   1844 
; Unmasked qword logical shift-right with the count from an xmm operand.
define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}
   1853 
; Merge-masked qword logical shift-right by xmm count under the
; zero-extended i8 mask.
define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}
   1865 
; Zero-masked qword logical shift-right by xmm count: expects {%k1} {z}.
define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}
   1876 
   1877 declare <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
   1878 
; Unmasked dword arithmetic shift-right with the count from an xmm operand.
define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}
   1887 
; Merge-masked dword arithmetic shift-right by xmm count into %a2 under %mask.
define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}
   1898 
; Zero-masked dword arithmetic shift-right by xmm count: expects {%k1} {z}.
define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}
   1908 
   1909 declare <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
   1910 
; Unmasked qword arithmetic shift-right with the count from an xmm operand.
define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}
   1919 
; Merge-masked qword arithmetic shift-right by xmm count under the
; zero-extended i8 mask.
define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}
   1931 
   1932 define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
   1933 ; CHECK-LABEL: test_x86_avx512_maskz_psra_q:
   1934 ; CHECK:       ## BB#0:
   1935 ; CHECK-NEXT:    movzbl %dil, %eax
   1936 ; CHECK-NEXT:    kmovw %eax, %k1
   1937 ; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
   1938 ; CHECK-NEXT:    retq
   1939   %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
   1940   ret <8 x i64> %res
   1941 }
   1942 
   1943 declare <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
   1944 
; Variable left shift, dword: all-ones mask selects the unpredicated vpsllvd.
define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

; Merge-masking: {%k1}-predicated vpsllvd into the pass-through register.
define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

; Zero-masking: zeroinitializer pass-through selects the {z} form.
define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone

; Variable left shift, qword: unpredicated vpsllvq for the all-ones mask.
define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

; Merge-masking, qword: i8 mask zero-extended (movzbl) before kmovw.
define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

; Zero-masking, qword: {z} form of vpsllvq.
define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
   2010 
   2011 
; Variable arithmetic right shift, dword: unpredicated vpsravd for mask -1.
define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

; Merge-masking: {%k1}-predicated vpsravd into the pass-through register.
define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

; Zero-masking: zeroinitializer pass-through selects the {z} form.
define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone

; Variable arithmetic right shift, qword: unpredicated vpsravq for mask -1.
define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

; Merge-masking, qword: i8 mask zero-extended (movzbl) before kmovw.
define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

; Zero-masking, qword: {z} form of vpsravq.
define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
   2077 
; Variable logical right shift, dword: unpredicated vpsrlvd for mask -1.
define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

; Merge-masking: {%k1}-predicated vpsrlvd into the pass-through register.
define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

; Zero-masking: zeroinitializer pass-through selects the {z} form.
define <16 x i32> @test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone

; Variable logical right shift, qword: unpredicated vpsrlvq for mask -1.
define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

; Merge-masking, qword: i8 mask zero-extended (movzbl) before kmovw.
define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

; Zero-masking, qword: {z} form of vpsrlvq.
define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
   2143 
; Shift counts loaded from memory: the load must fold into vpsrlvq's source
; operand rather than being emitted as a separate instruction.
define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, <8 x i64>* %ptr) {
; CHECK-LABEL: test_x86_avx512_psrlv_q_memop:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlvq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

; Masked FP arithmetic with an embedded rounding-mode operand; the calls below
; pair rounding arg 0/1/2/3 with {rn-sae}/{rd-sae}/{ru-sae}/{rz-sae} output.
declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
   2157 
; Rounding arg 0 selects {rn-sae} (round-to-nearest) on the unmasked vsubps.
define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 0)
  ret <16 x float> %res
}

; Rounding arg 1 selects {rd-sae} (round-down).
define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 1)
  ret <16 x float> %res
}

; Rounding arg 2 selects {ru-sae} (round-up).
define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 2)
  ret <16 x float> %res
}

; Rounding arg 3 selects {rz-sae} (round-toward-zero).
define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}

; Same rounding-mode matrix for vmulps, still unmasked (mask -1).
define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 0)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 1)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}
   2237 
;; mask float
;; Rounding-mode matrix again, now with a live mask and zeroinitializer
;; pass-through: each variant must combine the rounding operand with {z}.
define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 %mask, i32 0)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 %mask, i32 1)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 %mask, i32 3)
  ret <16 x float> %res
}
   2282 
;; With Passthru value
;; Merge-masking with rounding: a non-zero %passthru forces the predicated
;; vmulps into zmm2 ({%k1}, no {z}) plus a copy back to zmm0.
define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> %passthru, i16 %mask, i32 0)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> %passthru, i16 %mask, i32 1)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> %passthru, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> %passthru, i16 %mask, i32 3)
  ret <16 x float> %res
}
   2331 
;; mask double
;; Double-precision variant: the i8 mask is zero-extended (movzbl) before
;; kmovw, and zeroinitializer pass-through selects the {z} form of vmulpd.
define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double> zeroinitializer, i8 %mask, i32 0)
  ret <8 x double> %res
}

define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double> zeroinitializer, i8 %mask, i32 1)
  ret <8 x double> %res
}

define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double> zeroinitializer, i8 %mask, i32 2)
  ret <8 x double> %res
}

define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double> zeroinitializer, i8 %mask, i32 3)
  ret <8 x double> %res
}
   2380 
; 512-bit bitwise xor, dword lanes: all-ones mask emits unpredicated vpxord.
define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_xor_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpxord %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

; Merge-masked vpxord: result merged into %passThru under {%k1}.
define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_xor_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpxord %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

; 512-bit bitwise or, dword lanes: unpredicated vpord for mask -1.
define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_or_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

; Merge-masked vpord into the pass-through register.
define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_or_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

; 512-bit bitwise and, dword lanes: unpredicated vpandd for mask -1.
define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_and_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpandd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

; Merge-masked vpandd into the pass-through register.
define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_and_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpandd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
   2446 
; 512-bit bitwise xor, qword lanes: unpredicated vpxorq for mask -1.
define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_xor_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

; Merge-masked vpxorq: i8 mask zero-extended (movzbl) before kmovw.
define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

; 512-bit bitwise or, qword lanes: unpredicated vporq for mask -1.
define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_or_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

; Merge-masked vporq into the pass-through register.
define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

; 512-bit bitwise and, qword lanes: unpredicated vpandq for mask -1.
define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_and_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

; Merge-masked vpandq into the pass-through register.
define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
   2515 
   2516 
   2517 define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
   2518 ; CHECK-LABEL: test_mask_add_epi32_rr:
   2519 ; CHECK:       ## BB#0:
   2520 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   2521 ; CHECK-NEXT:    retq
   2522   %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
   2523   ret < 16 x i32> %res
   2524 }
   2525 
   2526 define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
   2527 ; CHECK-LABEL: test_mask_add_epi32_rrk:
   2528 ; CHECK:       ## BB#0:
   2529 ; CHECK-NEXT:    kmovw %edi, %k1
   2530 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm2 {%k1}
   2531 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
   2532 ; CHECK-NEXT:    retq
   2533   %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
   2534   ret < 16 x i32> %res
   2535 }
   2536 
   2537 define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
   2538 ; CHECK-LABEL: test_mask_add_epi32_rrkz:
   2539 ; CHECK:       ## BB#0:
   2540 ; CHECK-NEXT:    kmovw %edi, %k1
   2541 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
   2542 ; CHECK-NEXT:    retq
   2543   %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
   2544   ret < 16 x i32> %res
   2545 }
   2546 
   2547 define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
   2548 ; CHECK-LABEL: test_mask_add_epi32_rm:
   2549 ; CHECK:       ## BB#0:
   2550 ; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
   2551 ; CHECK-NEXT:    retq
   2552   %b = load <16 x i32>, <16 x i32>* %ptr_b
   2553   %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
   2554   ret < 16 x i32> %res
   2555 }
   2556 
   2557 define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
   2558 ; CHECK-LABEL: test_mask_add_epi32_rmk:
   2559 ; CHECK:       ## BB#0:
   2560 ; CHECK-NEXT:    kmovw %esi, %k1
   2561 ; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm1 {%k1}
   2562 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
   2563 ; CHECK-NEXT:    retq
   2564   %b = load <16 x i32>, <16 x i32>* %ptr_b
   2565   %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
   2566   ret < 16 x i32> %res
   2567 }
   2568 
; Zero-masked vpaddd with folded load: {%k1} {z} form, load stays a memory operand.
define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}
   2579 
; Unmasked vpaddd with a scalar-load splat recognized as an embedded broadcast {1to16}.
define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi32_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}
   2591 
; Merge-masked vpaddd with embedded broadcast {1to16}; merges into %passThru.
define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}
   2605 
; Zero-masked vpaddd with embedded broadcast {1to16} and {z} zeroing.
define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}
   2618 
   2619 declare <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
   2620 
; Unmasked (mask = -1) reg-reg vpsubd: no mask annotation expected in the output.
define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_sub_epi32_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}
   2629 
; Merge-masked reg-reg vpsubd into %passThru (zmm2), result copied back to zmm0.
define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}
   2640 
; Zero-masked reg-reg vpsubd: zeroinitializer passthru selects the {z} form.
define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}
   2650 
; Unmasked vpsubd with the full-width load folded into the memory operand.
define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi32_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}
   2660 
; Merge-masked vpsubd with folded load; merges into %passThru (zmm1).
define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubd (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}
   2672 
; Zero-masked vpsubd with folded load: {%k1} {z} form.
define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubd (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}
   2683 
; Unmasked vpsubd with scalar-load splat folded as an embedded broadcast {1to16}.
define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi32_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}
   2695 
; Merge-masked vpsubd with embedded broadcast {1to16}; merges into %passThru.
define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}
   2709 
; Zero-masked vpsubd with embedded broadcast {1to16} and {z} zeroing.
define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}
   2722 
   2723 declare <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
   2724 
; Unmasked (mask = -1) reg-reg vpaddq on <8 x i64>.
define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_mask_add_epi64_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}
   2733 
; Merge-masked reg-reg vpaddq; i8 mask goes through movzbl before kmovw.
define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}
   2745 
; Zero-masked reg-reg vpaddq: i8 mask zero-extended, {z} form expected.
define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}
   2756 
; Unmasked vpaddq with the full-width load folded into the memory operand.
define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi64_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}
   2766 
; Merge-masked vpaddq with folded load; merges into %passThru (zmm1).
define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}
   2779 
; Zero-masked vpaddq with folded load: {%k1} {z} form.
define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}
   2791 
; Unmasked vpaddq with scalar-load splat folded as an embedded broadcast {1to8}.
define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi64_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}
   2803 
; Merge-masked vpaddq with embedded broadcast {1to8}; merges into %passThru.
define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}
   2818 
; Zero-masked vpaddq with embedded broadcast {1to8} and {z} zeroing.
define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}
   2832 
   2833 declare <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
   2834 
; Unmasked (mask = -1) reg-reg vpsubq on <8 x i64>.
define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_mask_sub_epi64_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}
   2843 
; Merge-masked reg-reg vpsubq; i8 mask zero-extended via movzbl before kmovw.
define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}
   2855 
; Zero-masked reg-reg vpsubq: {z} form expected.
define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}
   2866 
; Unmasked vpsubq with the full-width load folded into the memory operand.
define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi64_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}
   2876 
; Merge-masked vpsubq with folded load; merges into %passThru (zmm1).
define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}
   2889 
; Zero-masked vpsubq with folded load: {%k1} {z} form.
define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}
   2901 
; Unmasked vpsubq with scalar-load splat folded as an embedded broadcast {1to8}.
define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi64_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}
   2913 
; Merge-masked vpsubq with embedded broadcast {1to8}; merges into %passThru.
define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}
   2928 
; Zero-masked vpsubq with embedded broadcast {1to8} and {z} zeroing.
define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}
   2942 
   2943 declare <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
   2944 
; Unmasked (mask = -1) reg-reg vpmuldq: <16 x i32> inputs, <8 x i64> widening-multiply result.
define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epi32_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}
   2953 
; Merge-masked reg-reg vpmuldq; i8 mask zero-extended before kmovw.
define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}
   2965 
; Zero-masked reg-reg vpmuldq: {z} form expected.
define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}
   2976 
; Unmasked vpmuldq with the full-width load folded into the memory operand.
define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epi32_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}
   2986 
; Merge-masked vpmuldq with folded load; merges into %passThru (zmm1).
define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}
   2999 
; Zero-masked vpmuldq with folded load: {%k1} {z} form.
define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}
   3011 
; Unmasked vpmuldq: i64 splat bitcast to <16 x i32> still folds as a qword broadcast {1to8}.
define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epi32_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}
   3024 
; Merge-masked vpmuldq with qword broadcast {1to8}; merges into %passThru.
define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}
   3040 
; Zero-masked vpmuldq with qword broadcast {1to8} and {z} zeroing.
define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}
   3055 
   3056 declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
   3057 
; Unmasked (mask = -1) reg-reg vpmuludq: unsigned widening multiply to <8 x i64>.
define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epu32_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}
   3066 
; Merge-masked reg-reg vpmuludq; i8 mask zero-extended before kmovw.
define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}
   3078 
; Zero-masked reg-reg vpmuludq: {z} form expected.
define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}
   3089 
; Unmasked vpmuludq with the full-width load folded into the memory operand.
define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epu32_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}
   3099 
; Merge-masked vpmuludq with folded load; merges into %passThru (zmm1).
define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}
   3112 
; Zero-masked vpmuludq with folded load: {%k1} {z} form.
define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}
   3124 
; Unmasked vpmuludq: i64 splat bitcast to <16 x i32> folds as a qword broadcast {1to8}.
define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epu32_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}
   3137 
; Merge-masked vpmuludq with qword broadcast {1to8}; merges into %passThru.
define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}
   3153 
; Zero-masked vpmuludq with qword broadcast {1to8} and {z} zeroing.
define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}
   3168 
   3169 declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
   3170 
; Unmasked (mask = -1) reg-reg vpmulld: 32-bit low-half multiply.
define <16 x i32> @test_mask_mullo_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mullo_epi32_rr_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}
   3179 
; Merge-masked reg-reg vpmulld; merges into %passThru (zmm2), copied back to zmm0.
define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rrk_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}
   3190 
; Zero-masked reg-reg vpmulld: {z} form expected.
define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rrkz_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}
   3200 
; Unmasked vpmulld with the full-width load folded into the memory operand.
define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi32_rm_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmulld (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}
   3210 
; Merge-masked vpmulld with folded load; merges into %passThru (zmm1).
define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rmk_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmulld (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}
   3222 
   3223 define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
   3224 ; CHECK-LABEL: test_mask_mullo_epi32_rmkz_512:
   3225 ; CHECK:       ## BB#0:
   3226 ; CHECK-NEXT:    kmovw %esi, %k1
   3227 ; CHECK-NEXT:    vpmulld (%rdi), %zmm0, %zmm0 {%k1} {z}
   3228 ; CHECK-NEXT:    retq
   3229   %b = load <16 x i32>, <16 x i32>* %ptr_b
   3230   %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
   3231   ret < 16 x i32> %res
   3232 }
   3233 
   3234 define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
   3235 ; CHECK-LABEL: test_mask_mullo_epi32_rmb_512:
   3236 ; CHECK:       ## BB#0:
   3237 ; CHECK-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm0
   3238 ; CHECK-NEXT:    retq
   3239   %q = load i32, i32* %ptr_b
   3240   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   3241   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
   3242   %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
   3243   ret < 16 x i32> %res
   3244 }
   3245 
   3246 define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
   3247 ; CHECK-LABEL: test_mask_mullo_epi32_rmbk_512:
   3248 ; CHECK:       ## BB#0:
   3249 ; CHECK-NEXT:    kmovw %esi, %k1
   3250 ; CHECK-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm1 {%k1}
   3251 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
   3252 ; CHECK-NEXT:    retq
   3253   %q = load i32, i32* %ptr_b
   3254   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   3255   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
   3256   %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
   3257   ret < 16 x i32> %res
   3258 }
   3259 
   3260 define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
   3261 ; CHECK-LABEL: test_mask_mullo_epi32_rmbkz_512:
   3262 ; CHECK:       ## BB#0:
   3263 ; CHECK-NEXT:    kmovw %esi, %k1
   3264 ; CHECK-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
   3265 ; CHECK-NEXT:    retq
   3266   %q = load i32, i32* %ptr_b
   3267   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   3268   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
   3269   %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
   3270   ret < 16 x i32> %res
   3271 }
   3272 
   3273 declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
   3274 
; Zero-masked VADDPS with explicit rounding control.  The last i32 argument of
; @llvm.x86.avx512.mask.add.ps.512 selects the embedded rounding mode; as the
; CHECK suffixes show: 0 -> {rn-sae}, 1 -> {rd-sae}, 2 -> {ru-sae},
; 3 -> {rz-sae}, 4 -> current (no rounding annotation emitted).

; imm 0: round-to-nearest with suppress-all-exceptions.
define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0)
  ret <16 x float> %res
}
; imm 1: round-down.
define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1)
  ret <16 x float> %res
}
; imm 2: round-up.
define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2)
  ret <16 x float> %res
}

; imm 3: round-toward-zero.
define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3)
  ret <16 x float> %res
}


; imm 4: use the current MXCSR rounding mode -- plain vaddps is emitted.
define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
  ret <16 x float> %res
}
   3323 
; Merge-masked VADDPS with explicit rounding control: same rounding-immediate
; coverage as the maskz variants above, but with a %src pass-through vector,
; so codegen must compute into %zmm2 under {%k1} and copy it to the return
; register.

define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
  ret <16 x float> %res
}


define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
  ret <16 x float> %res
}
   3377 
   3378 
; Unmasked VADDPS with rounding control: the mask operand is the all-ones
; constant i16 -1, so no kmovw/{%k1} should be emitted.  The %mask function
; parameter is intentionally unused.

define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}
; Signature: (src1, src2, pass-through, write-mask, rounding-mode imm).
declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
   3422 
; Merge-masked VSUBPS with rounding control; mirrors the add.ps.512 merge-mask
; tests above for @llvm.x86.avx512.mask.sub.ps.512 (declared elsewhere in
; this file).

define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
  ret <16 x float> %res
}


define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
  ret <16 x float> %res
}
   3476 
; Unmasked VSUBPS with rounding control (mask i16 -1 => no {%k1}); the %mask
; parameter is intentionally unused.

define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}
   3519 
; Zero-masked VDIVPS with rounding control; same rounding-immediate coverage
; (0=rn, 1=rd, 2=ru, 3=rz, 4=current) as the add/sub test groups.

define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3)
  ret <16 x float> %res
}


define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
  ret <16 x float> %res
}
   3568 
; Merge-masked VDIVPS with rounding control: computes into %zmm2 (pre-loaded
; with the %src pass-through) under {%k1}, then moves it to the return reg.

define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
  ret <16 x float> %res
}


define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
  ret <16 x float> %res
}
   3622 
   3623 
; Unmasked VDIVPS with rounding control (mask i16 -1); %mask parameter unused.

define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vdivps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}
; Signature: (src1, src2, pass-through, write-mask, rounding-mode imm).
declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
   3667 
; VMINPS tests.  Min/max have no embedded rounding, so only two immediates are
; exercised: 8 -> {sae} (suppress exceptions) and 4 -> current/default mode.
; Each masking flavor (zero-mask, merge-mask, unmasked) is covered for both.

define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_min_round_ps_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_min_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_min_round_ps_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_min_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_min_round_ps_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_min_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}
; Signature: (src1, src2, pass-through, write-mask, sae/rounding imm).
declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
   3728 
; VMAXPS tests, mirroring the min.ps.512 group above: imm 8 -> {sae},
; imm 4 -> current mode; zero-mask / merge-mask / unmasked flavors.

define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_max_round_ps_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_max_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_max_round_ps_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_max_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_max_round_ps_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_max_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}
; Signature: (src1, src2, pass-through, write-mask, sae/rounding imm).
declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
   3789 
; Scalar VADDSS with rounding control and an i8 mask.  Only bit 0 of the mask
; is meaningful for a scalar op, hence the expected "andl $1, %edi" before the
; kmovw in each test.
declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

; Merge-masked scalar add, rounding imm 0 -> {rn-sae}.
define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 0)
  ret <4 x float> %res
}

; Rounding imm 1 -> {rd-sae}.
define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
  ret <4 x float> %res
}

; Rounding imm 2 -> {ru-sae}.
define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 2)
  ret <4 x float> %res
}
   3827 
   3828 define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
   3829 ; CHECK-LABEL: test_mask_add_ss_rz:
   3830 ; CHECK:       ## BB#0:
   3831 ; CHECK-NEXT:    andl $1, %edi
   3832 ; CHECK-NEXT:    kmovw %edi, %k1
   3833 ; CHECK-NEXT:    vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   3834 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
   3835 ; CHECK-NEXT:    retq
   3836   %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 3)
   3837   ret <4 x float> %res
   3838 }
   3839 
   3840 define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
   3841 ; CHECK-LABEL: test_mask_add_ss_current:
   3842 ; CHECK:       ## BB#0:
   3843 ; CHECK-NEXT:    andl $1, %edi
   3844 ; CHECK-NEXT:    kmovw %edi, %k1
   3845 ; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
   3846 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
   3847 ; CHECK-NEXT:    retq
   3848   %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
   3849   ret <4 x float> %res
   3850 }
   3851 
   3852 define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
   3853 ; CHECK-LABEL: test_maskz_add_ss_rn:
   3854 ; CHECK:       ## BB#0:
   3855 ; CHECK-NEXT:    andl $1, %edi
   3856 ; CHECK-NEXT:    kmovw %edi, %k1
   3857 ; CHECK-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
   3858 ; CHECK-NEXT:    retq
   3859   %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 0)
   3860   ret <4 x float> %res
   3861 }
   3862 
   3863 define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
   3864 ; CHECK-LABEL: test_add_ss_rn:
   3865 ; CHECK:       ## BB#0:
   3866 ; CHECK-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm0
   3867 ; CHECK-NEXT:    retq
   3868   %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 0)
   3869   ret <4 x float> %res
   3870 }
   3871 
; Scalar vaddsd with explicit rounding control — the double-precision mirror of
; the vaddss tests above. Rounding arg: 0=rn, 1=rd, 2=ru, 3=rz, 4=current.
declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

; Masked round-to-nearest (arg 0).
define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 0)
  ret <2 x double> %res
}

; Masked round-toward-negative-infinity (arg 1).
define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
  ret <2 x double> %res
}

; Masked round-toward-positive-infinity (arg 2).
define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 2)
  ret <2 x double> %res
}

; Masked round-toward-zero (arg 3).
define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 3)
  ret <2 x double> %res
}

; Masked, current rounding mode (arg 4): no rounding annotation on vaddsd.
define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
  ret <2 x double> %res
}

; Zero-masked round-to-nearest: expect {z}.
define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_sd_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 0)
  ret <2 x double> %res
}

; Unmasked (mask -1) round-to-nearest: no k-register setup.
define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_add_sd_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 0)
  ret <2 x double> %res
}
   3953 
; Scalar vmaxss with rounding control. For min/max only SAE (arg 8) or the
; current mode (arg 4) are meaningful; both masked, zero-masked, and unmasked
; forms are covered.
declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

; Masked vmaxss with SAE.
define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
  ret <4 x float> %res
}

; Zero-masked vmaxss with SAE: expect {z}.
define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_ss_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
  ret <4 x float> %res
}

; Unmasked vmaxss with SAE.
define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_max_ss_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
  ret <4 x float> %res
}

; Masked vmaxss, current mode (arg 4): no {sae} annotation.
define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
  ret <4 x float> %res
}

; Zero-masked vmaxss, current mode.
define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4)
  ret <4 x float> %res
}

; Unmasked vmaxss, current mode.
define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_max_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
  ret <4 x float> %res
}
; Scalar vmaxsd with rounding control — double-precision mirror of the vmaxss
; tests above (SAE = arg 8, current mode = arg 4).
declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

; Masked vmaxsd with SAE.
define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
  ret <2 x double> %res
}

; Zero-masked vmaxsd with SAE: expect {z}.
define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
  ret <2 x double> %res
}

; Unmasked vmaxsd with SAE.
define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_max_sd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
  ret <2 x double> %res
}

; Masked vmaxsd, current mode (arg 4).
define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
  ret <2 x double> %res
}

; Zero-masked vmaxsd, current mode.
define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4)
  ret <2 x double> %res
}

; Unmasked vmaxsd, current mode.
define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_max_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
  ret <2 x double> %res
}
   4084 
; Signed int -> scalar FP conversions with embedded round-toward-zero
; (rounding arg 3 -> {rz-sae}); covers 32- and 64-bit sources for both sd and ss.
define <2 x double> @test_x86_avx512_cvtsi2sd32(<2 x double> %a, i32 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2sd32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2sdl %edi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.cvtsi2sd32(<2 x double> %a, i32 %b, i32 3) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.cvtsi2sd32(<2 x double>, i32, i32) nounwind readnone

; i64 source variant: expect the q-suffixed form reading %rdi.
define <2 x double> @test_x86_avx512_cvtsi2sd64(<2 x double> %a, i64 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2sd64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2sdq %rdi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.cvtsi2sd64(<2 x double> %a, i64 %b, i32 3) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.cvtsi2sd64(<2 x double>, i64, i32) nounwind readnone

; Single-precision, i32 source.
define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2ss32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2ssl %edi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 3) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind readnone

; Single-precision, i64 source.
define <4 x float> @test_x86_avx512_cvtsi2ss64(<4 x float> %a, i64 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2ss64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2ssq %rdi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss64(<4 x float> %a, i64 %b, i32 3) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.cvtsi2ss64(<4 x float>, i64, i32) nounwind readnone
   4124 
; Unsigned int -> float conversions (vcvtusi2ss). The *_roundu* tests use
; embedded round-down (arg 1 -> {rd-sae}); the plain *_cvtu* tests use the
; current mode (arg 4). The *_mem variants load the scalar from memory first;
; note that with embedded rounding the load is NOT folded (separate movl),
; while the current-mode variant folds it into the conversion.
define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b)
; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu32_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssl %edi, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 1) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}

; Memory-source form with embedded rounding: scalar is loaded, not folded.
define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, i32* %ptr)
; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu32_ss_mem:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl (%rdi), %eax
; CHECK-NEXT:    vcvtusi2ssl %eax, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %b = load i32, i32* %ptr
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 1) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}

; Current rounding mode: no rounding annotation emitted.
define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b)
; CHECK-LABEL: test_x86_avx512__mm_cvtu32_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssl %edi, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}

; Current mode, memory source: the load folds into the conversion.
define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, i32* %ptr)
; CHECK-LABEL: test_x86_avx512__mm_cvtu32_ss_mem:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssl (%rdi), %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %b = load i32, i32* %ptr
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind readnone

; u64 source with embedded round-down.
define <4 x float> @_mm_cvt_roundu64_ss (<4 x float> %a, i64 %b)
; CHECK-LABEL: _mm_cvt_roundu64_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssq %rdi, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float> %a, i64 %b, i32 1) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}

; u64 source, current rounding mode.
define <4 x float> @_mm_cvtu64_ss(<4 x float> %a, i64 %b)
; CHECK-LABEL: _mm_cvtu64_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssq %rdi, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float> %a, i64 %b, i32 4) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float>, i64, i32) nounwind readnone
   4189 
; Unsigned int -> double conversions (vcvtusi2sd). The 32-bit intrinsic takes
; no rounding operand (u32->f64 is exact), so no rounding annotation appears.
define <2 x double> @test_x86_avx512_mm_cvtu32_sd(<2 x double> %a, i32 %b)
; CHECK-LABEL: test_x86_avx512_mm_cvtu32_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2sdl %edi, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double> %a, i32 %b) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double>, i32) nounwind readnone

; u64 source with embedded round-down (arg 1 -> {rd-sae}).
; NOTE(review): naming looks swapped relative to the *_ss tests above — here
; the "cvtu64" test uses embedded rounding while "cvt_roundu64" uses the
; current mode. Behaviorally each test is self-consistent; confirm intent
; before renaming, since the checks are autogenerated.
define <2 x double> @test_x86_avx512_mm_cvtu64_sd(<2 x double> %a, i64 %b)
; CHECK-LABEL: test_x86_avx512_mm_cvtu64_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2sdq %rdi, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a, i64 %b, i32 1) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}

; u64 source, current rounding mode (arg 4).
define <2 x double> @test_x86_avx512__mm_cvt_roundu64_sd(<2 x double> %a, i64 %b)
; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu64_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2sdq %rdi, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a, i64 %b, i32 4) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double>, i64, i32) nounwind readnone
   4221 
; Unmasked integer min/max: passthru = zeroinitializer and mask = -1, so the
; masked intrinsics should lower to the plain vector instructions.
define <8 x i64> @test_vpmaxq(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_vpmaxq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %a0, <8 x i64> %a1,
                    <8 x i64>zeroinitializer, i8 -1)
  ret <8 x i64> %res
}
declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

; Unsigned dword minimum.
define <16 x i32> @test_vpminud(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_vpminud:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpminud %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %a0, <16 x i32> %a1,
                    <16 x i32>zeroinitializer, i16 -1)
  ret <16 x i32> %res
}
declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

; Signed dword maximum.
define <16 x i32> @test_vpmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_vpmaxsd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %a0, <16 x i32> %a1,
                    <16 x i32>zeroinitializer, i16 -1)
  ret <16 x i32> %res
}
declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
   4254 
; Masked integer min/max tests. Each function issues the intrinsic twice —
; once with the real mask (result merged into %x2 / %zmm2) and once unmasked
; (mask -1) — then adds the two results so both lowerings stay live. The i8
; masks for the q-variants go through movzbl before kmovw.
define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

; Signed qword max, masked + unmasked.
define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

; Unsigned dword max, masked + unmasked.
define <16 x i32>@test_int_x86_avx512_mask_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmaxud %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

; Unsigned qword max, masked + unmasked.
define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

; Signed dword min, masked + unmasked.
define <16 x i32>@test_int_x86_avx512_mask_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpminsd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

; Signed qword min, masked + unmasked.
define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

; Unsigned dword min, masked + unmasked (declare is earlier in the file).
define <16 x i32>@test_int_x86_avx512_mask_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpminud %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpminud %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

; Unsigned qword min, masked + unmasked.
define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpminuq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
   4380 
   4381 declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
   4382 
   4383 define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
   4384 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
   4385 ; CHECK:       ## BB#0:
   4386 ; CHECK-NEXT:    kmovw %esi, %k1
   4387 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
   4388 ; CHECK-NEXT:    vpermi2d (%rdi), %zmm0, %zmm3 {%k1}
   4389 ; CHECK-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
   4390 ; CHECK-NEXT:    vpaddd %zmm1, %zmm3, %zmm0
   4391 ; CHECK-NEXT:    retq
   4392   %x2 = load <16 x i32>, <16 x i32>* %x2p
   4393   %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   4394   %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
   4395   %res2 = add <16 x i32> %res, %res1
   4396   ret <16 x i32> %res2
   4397 }
   4398 
   4399 declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
   4400 
   4401 define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
   4402 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
   4403 ; CHECK:       ## BB#0:
   4404 ; CHECK-NEXT:    movzbl %dil, %eax
   4405 ; CHECK-NEXT:    kmovw %eax, %k1
   4406 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
   4407 ; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm3 {%k1}
   4408 ; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1
   4409 ; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
   4410 ; CHECK-NEXT:    retq
   4411   %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
   4412   %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
   4413   %res2 = fadd <8 x double> %res, %res1
   4414   ret <8 x double> %res2
   4415 }
   4416 
   4417 declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
   4418 
        ; vpermi2var.ps.512: i16 mask moves straight into k1 (no zero-extension
        ; needed); masked and unmasked results are added together.
   4419 define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
   4420 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
   4421 ; CHECK:       ## BB#0:
   4422 ; CHECK-NEXT:    kmovw %edi, %k1
   4423 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
   4424 ; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm3 {%k1}
   4425 ; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1
   4426 ; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
   4427 ; CHECK-NEXT:    retq
   4428   %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
   4429   %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
   4430   %res2 = fadd <16 x float> %res, %res1
   4431   ret <16 x float> %res2
   4432 }
   4433 
   4434 declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
   4435 
        ; vpermi2var.q.512 (qword permute): i8 mask path (movzbl + kmovw), masked
        ; and unmasked results combined via vpaddq.
   4436 define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
   4437 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
   4438 ; CHECK:       ## BB#0:
   4439 ; CHECK-NEXT:    movzbl %dil, %eax
   4440 ; CHECK-NEXT:    kmovw %eax, %k1
   4441 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
   4442 ; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3 {%k1}
   4443 ; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
   4444 ; CHECK-NEXT:    vpaddq %zmm1, %zmm3, %zmm0
   4445 ; CHECK-NEXT:    retq
   4446   %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   4447   %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
   4448   %res2 = add <8 x i64> %res, %res1
   4449   ret <8 x i64> %res2
   4450 }
   4451 
   4452 declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
   4453 
        ; Zero-masking vpermt2var.d.512 ({k1} {z}) with the data operand folded
        ; from memory; the unmasked call deliberately reuses %x1 as both table and
        ; data, which is why the second vpermt2d uses the same zmm twice.
   4454 define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
   4455 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
   4456 ; CHECK:       ## BB#0:
   4457 ; CHECK-NEXT:    kmovw %esi, %k1
   4458 ; CHECK-NEXT:    vmovaps %zmm1, %zmm2
   4459 ; CHECK-NEXT:    vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z}
   4460 ; CHECK-NEXT:    vpermt2d %zmm1, %zmm0, %zmm1
   4461 ; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm0
   4462 ; CHECK-NEXT:    retq
   4463   %x2 = load <16 x i32>, <16 x i32>* %x2p
   4464   %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   4465   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x1, i16 -1)
   4466   %res2 = add <16 x i32> %res, %res1
   4467   ret <16 x i32> %res2
   4468 }
   4469 
   4470 declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
   4471 
        ; Zero-masking vpermt2var.pd.512 where the data operand is a splatted
        ; scalar load, so the broadcast folds into the instruction as (%rdi){1to8}.
   4472 define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
   4473 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
   4474 ; CHECK:       ## BB#0:
   4475 ; CHECK-NEXT:    movzbl %sil, %eax
   4476 ; CHECK-NEXT:    kmovw %eax, %k1
   4477 ; CHECK-NEXT:    vmovaps %zmm1, %zmm2
   4478 ; CHECK-NEXT:    vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
   4479 ; CHECK-NEXT:    vpermt2pd %zmm1, %zmm0, %zmm1
   4480 ; CHECK-NEXT:    vaddpd %zmm1, %zmm2, %zmm0
   4481 ; CHECK-NEXT:    retq
   4482   %x2s = load double, double* %x2ptr
   4483   %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
   4484   %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
   4485   %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
   4486   %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x1, i8 -1)
   4487   %res2 = fadd <8 x double> %res, %res1
   4488   ret <8 x double> %res2
   4489 }
   4490 
   4491 declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
   4492 
        ; Zero-masking vpermt2var.ps.512 with all operands in registers; masked
        ; ({k1} {z}) and unmasked results are added.
   4493 define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
   4494 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
   4495 ; CHECK:       ## BB#0:
   4496 ; CHECK-NEXT:    kmovw %edi, %k1
   4497 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
   4498 ; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm3 {%k1} {z}
   4499 ; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm1
   4500 ; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
   4501 ; CHECK-NEXT:    retq
   4502   %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
   4503   %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
   4504   %res2 = fadd <16 x float> %res, %res1
   4505   ret <16 x float> %res2
   4506 }
   4507 
   4508 
   4509 declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
   4510 
        ; Zero-masking vpermt2var.q.512, i8 mask path; masked and unmasked results
        ; combined with vpaddq.
   4511 define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
   4512 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
   4513 ; CHECK:       ## BB#0:
   4514 ; CHECK-NEXT:    movzbl %dil, %eax
   4515 ; CHECK-NEXT:    kmovw %eax, %k1
   4516 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
   4517 ; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm3 {%k1} {z}
   4518 ; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
   4519 ; CHECK-NEXT:    vpaddq %zmm1, %zmm3, %zmm0
   4520 ; CHECK-NEXT:    retq
   4521   %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   4522   %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
   4523   %res2 = add <8 x i64> %res, %res1
   4524   ret <8 x i64> %res2
   4525 }
   4526 
   4527 declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
   4528 
        ; Merge-masking vpermt2var.d.512 ({k1} without {z}), contrasted with the
        ; zero-masking variant above; masked and unmasked results are added.
   4529 define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
   4530 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
   4531 ; CHECK:       ## BB#0:
   4532 ; CHECK-NEXT:    kmovw %edi, %k1
   4533 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
   4534 ; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3 {%k1}
   4535 ; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
   4536 ; CHECK-NEXT:    vpaddd %zmm1, %zmm3, %zmm0
   4537 ; CHECK-NEXT:    retq
   4538   %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   4539   %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
   4540   %res2 = add <16 x i32> %res, %res1
   4541   ret <16 x i32> %res2
   4542 }
   4543 
   4544 declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
        ; vscalefpd with explicit rounding control: i32 3 selects {rz-sae}
        ; (round-toward-zero) for the masked call and i32 0 selects {rn-sae}
        ; (round-to-nearest) for the unmasked one.
   4545 define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
   4546 ; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
   4547 ; CHECK:       ## BB#0:
   4548 ; CHECK-NEXT:    movzbl %dil, %eax
   4549 ; CHECK-NEXT:    kmovw %eax, %k1
   4550 ; CHECK-NEXT:    vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4551 ; CHECK-NEXT:    vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
   4552 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
   4553 ; CHECK-NEXT:    retq
   4554   %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 3)
   4555   %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
   4556   %res2 = fadd <8 x double> %res, %res1
   4557   ret <8 x double> %res2
   4558 }
   4559 
   4560 declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
        ; vscalefps with rounding control: i32 2 -> {ru-sae} (round-up) masked,
        ; i32 0 -> {rn-sae} unmasked.
   4561 define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
   4562 ; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ps_512:
   4563 ; CHECK:       ## BB#0:
   4564 ; CHECK-NEXT:    kmovw %edi, %k1
   4565 ; CHECK-NEXT:    vscalefps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4566 ; CHECK-NEXT:    vscalefps {rn-sae}, %zmm1, %zmm0, %zmm0
   4567 ; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
   4568 ; CHECK-NEXT:    retq
   4569   %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 2)
   4570   %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
   4571   %res2 = fadd <16 x float> %res, %res1
   4572   ret <16 x float> %res2
   4573 }
   4574 
   4575 declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
   4576 
        ; Masked/unmasked vunpckhpd.
        ; NOTE(review): the autogenerated shuffle-decode text on the masked line
        ; prints "k1[...]" where a zmm source register would be expected; this is
        ; what the script emitted, but it looks stale -- confirm by regenerating
        ; with update_llc_test_checks.py before relying on it.
   4577 define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
   4578 ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
   4579 ; CHECK:       ## BB#0:
   4580 ; CHECK-NEXT:    movzbl %dil, %eax
   4581 ; CHECK-NEXT:    kmovw %eax, %k1
   4582 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
   4583 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
   4584 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
   4585 ; CHECK-NEXT:    retq
   4586   %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
   4587   %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
   4588   %res2 = fadd <8 x double> %res, %res1
   4589   ret <8 x double> %res2
   4590 }
   4591 
   4592 declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
   4593 
        ; Masked/unmasked vunpckhps. NOTE(review): "k1[...]" in the decode text on
        ; the masked line looks like stale autogenerated output -- verify by
        ; regenerating.
   4594 define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
   4595 ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
   4596 ; CHECK:       ## BB#0:
   4597 ; CHECK-NEXT:    kmovw %edi, %k1
   4598 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 = zmm2[2],k1[2],zmm2[3],k1[3],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[14],k1[14],zmm2[15],k1[15]
   4599 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
   4600 ; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
   4601 ; CHECK-NEXT:    retq
   4602   %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
   4603   %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
   4604   %res2 = fadd <16 x float> %res, %res1
   4605   ret <16 x float> %res2
   4606 }
   4607 
   4608 declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
   4609 
        ; Masked/unmasked vunpcklpd. NOTE(review): "k1[...]" in the masked decode
        ; text looks like stale autogenerated output -- verify by regenerating.
   4610 define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
   4611 ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
   4612 ; CHECK:       ## BB#0:
   4613 ; CHECK-NEXT:    movzbl %dil, %eax
   4614 ; CHECK-NEXT:    kmovw %eax, %k1
   4615 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
   4616 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
   4617 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
   4618 ; CHECK-NEXT:    retq
   4619   %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
   4620   %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
   4621   %res2 = fadd <8 x double> %res, %res1
   4622   ret <8 x double> %res2
   4623 }
   4624 
   4625 declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
   4626 
        ; Masked/unmasked vunpcklps. NOTE(review): "k1[...]" in the masked decode
        ; text looks like stale autogenerated output -- verify by regenerating.
   4627 define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
   4628 ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
   4629 ; CHECK:       ## BB#0:
   4630 ; CHECK-NEXT:    kmovw %edi, %k1
   4631 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[12],k1[12],zmm2[13],k1[13]
   4632 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
   4633 ; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
   4634 ; CHECK-NEXT:    retq
   4635   %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
   4636   %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
   4637   %res2 = fadd <16 x float> %res, %res1
   4638   ret <16 x float> %res2
   4639 }
   4640 
   4641 declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
   4642 
        ; vpunpcklqdq: exercises merge-masked, unmasked, and zero-passthru
        ; (zeroinitializer third operand) forms; all three results are summed.
        ; NOTE(review): "k1[...]" in the decode text looks like stale autogenerated
        ; output -- verify by regenerating.
   4643 define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
   4644 ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
   4645 ; CHECK:       ## BB#0:
   4646 ; CHECK-NEXT:    movzbl %dil, %eax
   4647 ; CHECK-NEXT:    kmovw %eax, %k1
   4648 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
   4649 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = k1[0],zmm0[0],k1[2],zmm0[2],k1[4],zmm0[4],k1[6],zmm0[6]
   4650 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
   4651 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
   4652 ; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
   4653 ; CHECK-NEXT:    retq
   4654   %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   4655   %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
   4656   %res2 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3)
   4657   %res3 = add <8 x i64> %res, %res1
   4658   %res4 = add <8 x i64> %res2, %res3
   4659   ret <8 x i64> %res4
   4660 }
   4661 
   4662 declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
   4663 
        ; Masked/unmasked vpunpckhqdq. NOTE(review): "k1[...]" in the masked decode
        ; text looks like stale autogenerated output -- verify by regenerating.
   4664 define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
   4665 ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
   4666 ; CHECK:       ## BB#0:
   4667 ; CHECK-NEXT:    movzbl %dil, %eax
   4668 ; CHECK-NEXT:    kmovw %eax, %k1
   4669 ; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
   4670 ; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
   4671 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
   4672 ; CHECK-NEXT:    retq
   4673   %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   4674   %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
   4675   %res2 = add <8 x i64> %res, %res1
   4676   ret <8 x i64> %res2
   4677 }
   4678 
   4679 declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
   4680 
        ; Masked/unmasked vpunpckhdq. NOTE(review): "k1[...]" in the masked decode
        ; text looks like stale autogenerated output -- verify by regenerating.
   4681 define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
   4682 ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
   4683 ; CHECK:       ## BB#0:
   4684 ; CHECK-NEXT:    kmovw %edi, %k1
   4685 ; CHECK-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm2[2],k1[2],zmm2[3],k1[3],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[14],k1[14],zmm2[15],k1[15]
   4686 ; CHECK-NEXT:    vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
   4687 ; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
   4688 ; CHECK-NEXT:    retq
   4689   %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   4690   %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
   4691   %res2 = add <16 x i32> %res, %res1
   4692   ret <16 x i32> %res2
   4693 }
   4694 
   4695 declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
   4696 
        ; Masked/unmasked vpunpckldq. NOTE(review): "k1[...]" in the masked decode
        ; text looks like stale autogenerated output -- verify by regenerating.
   4697 define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
   4698 ; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
   4699 ; CHECK:       ## BB#0:
   4700 ; CHECK-NEXT:    kmovw %edi, %k1
   4701 ; CHECK-NEXT:    vpunpckldq {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[12],k1[12],zmm2[13],k1[13]
   4702 ; CHECK-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
   4703 ; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
   4704 ; CHECK-NEXT:    retq
   4705   %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   4706   %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
   4707   %res2 = add <16 x i32> %res, %res1
   4708   ret <16 x i32> %res2
   4709 }
   4710 
   4711 declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8)
   4712 
        ; vpmovqb (q->b truncation): exercises unmasked, merge-masked ({k1}), and
        ; zero-masked ({k1} {z}, via zeroinitializer passthru) forms in one test.
   4713 define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
   4714 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_512:
   4715 ; CHECK:       ## BB#0:
   4716 ; CHECK-NEXT:    kmovw %edi, %k1
   4717 ; CHECK-NEXT:    vpmovqb %zmm0, %xmm1 {%k1}
   4718 ; CHECK-NEXT:    vpmovqb %zmm0, %xmm2 {%k1} {z}
   4719 ; CHECK-NEXT:    vpmovqb %zmm0, %xmm0
   4720 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
   4721 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
   4722 ; CHECK-NEXT:    retq
   4723     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
   4724     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
   4725     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
   4726     %res3 = add <16 x i8> %res0, %res1
   4727     %res4 = add <16 x i8> %res3, %res2
   4728     ret <16 x i8> %res4
   4729 }
   4730 
   4731 declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64>, i8)
   4732 
        ; Truncating-store form of vpmovqb: one unmasked store and one masked
        ; store to the same address.
   4733 define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
   4734 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
   4735 ; CHECK:       ## BB#0:
   4736 ; CHECK-NEXT:    movzbl %sil, %eax
   4737 ; CHECK-NEXT:    kmovw %eax, %k1
   4738 ; CHECK-NEXT:    vpmovqb %zmm0, (%rdi)
   4739 ; CHECK-NEXT:    vpmovqb %zmm0, (%rdi) {%k1}
   4740 ; CHECK-NEXT:    retq
   4741     call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
   4742     call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
   4743     ret void
   4744 }
   4745 
   4746 declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8)
   4747 
        ; vpmovsqb (signed-saturating q->b truncation): unmasked, merge-masked,
        ; and zero-masked forms.
   4748 define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
   4749 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_512:
   4750 ; CHECK:       ## BB#0:
   4751 ; CHECK-NEXT:    kmovw %edi, %k1
   4752 ; CHECK-NEXT:    vpmovsqb %zmm0, %xmm1 {%k1}
   4753 ; CHECK-NEXT:    vpmovsqb %zmm0, %xmm2 {%k1} {z}
   4754 ; CHECK-NEXT:    vpmovsqb %zmm0, %xmm0
   4755 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
   4756 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
   4757 ; CHECK-NEXT:    retq
   4758     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
   4759     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
   4760     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
   4761     %res3 = add <16 x i8> %res0, %res1
   4762     %res4 = add <16 x i8> %res3, %res2
   4763     ret <16 x i8> %res4
   4764 }
   4765 
   4766 declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64>, i8)
   4767 
        ; Truncating-store form of vpmovsqb: unmasked store then masked store.
   4768 define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
   4769 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512:
   4770 ; CHECK:       ## BB#0:
   4771 ; CHECK-NEXT:    vpmovsqb %zmm0, (%rdi)
   4772 ; CHECK-NEXT:    kmovw %esi, %k1
   4773 ; CHECK-NEXT:    vpmovsqb %zmm0, (%rdi) {%k1}
   4774 ; CHECK-NEXT:    retq
   4775     call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
   4776     call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
   4777     ret void
   4778 }
   4779 
   4780 declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8)
   4781 
        ; vpmovusqb (unsigned-saturating q->b truncation): unmasked, merge-masked,
        ; and zero-masked forms.
   4782 define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
   4783 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_512:
   4784 ; CHECK:       ## BB#0:
   4785 ; CHECK-NEXT:    kmovw %edi, %k1
   4786 ; CHECK-NEXT:    vpmovusqb %zmm0, %xmm1 {%k1}
   4787 ; CHECK-NEXT:    vpmovusqb %zmm0, %xmm2 {%k1} {z}
   4788 ; CHECK-NEXT:    vpmovusqb %zmm0, %xmm0
   4789 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
   4790 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
   4791 ; CHECK-NEXT:    retq
   4792     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
   4793     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
   4794     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
   4795     %res3 = add <16 x i8> %res0, %res1
   4796     %res4 = add <16 x i8> %res3, %res2
   4797     ret <16 x i8> %res4
   4798 }
   4799 
   4800 declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64>, i8)
   4801 
        ; Truncating-store form of vpmovusqb: unmasked store then masked store.
   4802 define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
   4803 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512:
   4804 ; CHECK:       ## BB#0:
   4805 ; CHECK-NEXT:    vpmovusqb %zmm0, (%rdi)
   4806 ; CHECK-NEXT:    kmovw %esi, %k1
   4807 ; CHECK-NEXT:    vpmovusqb %zmm0, (%rdi) {%k1}
   4808 ; CHECK-NEXT:    retq
   4809     call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
   4810     call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
   4811     ret void
   4812 }
   4813 
   4814 declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
   4815 
        ; vpmovqw (q->w truncation): unmasked, merge-masked, and zero-masked forms.
   4816 define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
   4817 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
   4818 ; CHECK:       ## BB#0:
   4819 ; CHECK-NEXT:    movzbl %dil, %eax
   4820 ; CHECK-NEXT:    kmovw %eax, %k1
   4821 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm1 {%k1}
   4822 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm2 {%k1} {z}
   4823 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm0
   4824 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
   4825 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
   4826 ; CHECK-NEXT:    retq
   4827     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
   4828     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
   4829     %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
   4830     %res3 = add <8 x i16> %res0, %res1
   4831     %res4 = add <8 x i16> %res3, %res2
   4832     ret <8 x i16> %res4
   4833 }
   4834 
   4835 declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64>, i8)
   4836 
        ; Truncating-store form of vpmovqw: unmasked store then masked store.
   4837 define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
   4838 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
   4839 ; CHECK:       ## BB#0:
   4840 ; CHECK-NEXT:    movzbl %sil, %eax
   4841 ; CHECK-NEXT:    kmovw %eax, %k1
   4842 ; CHECK-NEXT:    vpmovqw %zmm0, (%rdi)
   4843 ; CHECK-NEXT:    vpmovqw %zmm0, (%rdi) {%k1}
   4844 ; CHECK-NEXT:    retq
   4845     call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
   4846     call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
   4847     ret void
   4848 }
   4849 
   4850 declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
   4851 
        ; vpmovsqw (signed-saturating q->w truncation): unmasked, merge-masked,
        ; and zero-masked forms.
   4852 define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
   4853 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
   4854 ; CHECK:       ## BB#0:
   4855 ; CHECK-NEXT:    movzbl %dil, %eax
   4856 ; CHECK-NEXT:    kmovw %eax, %k1
   4857 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm1 {%k1}
   4858 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm2 {%k1} {z}
   4859 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm0
   4860 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
   4861 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
   4862 ; CHECK-NEXT:    retq
   4863     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
   4864     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
   4865     %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
   4866     %res3 = add <8 x i16> %res0, %res1
   4867     %res4 = add <8 x i16> %res3, %res2
   4868     ret <8 x i16> %res4
   4869 }
   4870 
   4871 declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64>, i8)
   4872 
        ; Truncating-store form of vpmovsqw: unmasked store then masked store.
   4873 define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
   4874 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512:
   4875 ; CHECK:       ## BB#0:
   4876 ; CHECK-NEXT:    vpmovsqw %zmm0, (%rdi)
   4877 ; CHECK-NEXT:    kmovw %esi, %k1
   4878 ; CHECK-NEXT:    vpmovsqw %zmm0, (%rdi) {%k1}
   4879 ; CHECK-NEXT:    retq
   4880     call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
   4881     call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
   4882     ret void
   4883 }
   4884 
   4885 declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
   4886 
        ; vpmovusqw (unsigned-saturating q->w truncation): unmasked, merge-masked,
        ; and zero-masked forms.
   4887 define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
   4888 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
   4889 ; CHECK:       ## BB#0:
   4890 ; CHECK-NEXT:    movzbl %dil, %eax
   4891 ; CHECK-NEXT:    kmovw %eax, %k1
   4892 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm1 {%k1}
   4893 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm2 {%k1} {z}
   4894 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm0
   4895 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
   4896 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
   4897 ; CHECK-NEXT:    retq
   4898     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
   4899     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
   4900     %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
   4901     %res3 = add <8 x i16> %res0, %res1
   4902     %res4 = add <8 x i16> %res3, %res2
   4903     ret <8 x i16> %res4
   4904 }
   4905 
   4906 declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64>, i8)
   4907 
        ; Truncating-store form of vpmovusqw: unmasked store then masked store.
   4908 define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
   4909 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512:
   4910 ; CHECK:       ## BB#0:
   4911 ; CHECK-NEXT:    vpmovusqw %zmm0, (%rdi)
   4912 ; CHECK-NEXT:    kmovw %esi, %k1
   4913 ; CHECK-NEXT:    vpmovusqw %zmm0, (%rdi) {%k1}
   4914 ; CHECK-NEXT:    retq
   4915     call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
   4916     call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
   4917     ret void
   4918 }
   4919 
   4920 declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
   4921 
        ; vpmovqd (q->d truncation, result is a ymm): unmasked, merge-masked, and
        ; zero-masked forms.
   4922 define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
   4923 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
   4924 ; CHECK:       ## BB#0:
   4925 ; CHECK-NEXT:    movzbl %dil, %eax
   4926 ; CHECK-NEXT:    kmovw %eax, %k1
   4927 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm1 {%k1}
   4928 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm2 {%k1} {z}
   4929 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm0
   4930 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
   4931 ; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
   4932 ; CHECK-NEXT:    retq
   4933     %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
   4934     %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
   4935     %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
   4936     %res3 = add <8 x i32> %res0, %res1
   4937     %res4 = add <8 x i32> %res3, %res2
   4938     ret <8 x i32> %res4
   4939 }
   4940 
   4941 declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8)
   4942 
        ; Truncating-store form of vpmovqd: unmasked store then masked store.
   4943 define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
   4944 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
   4945 ; CHECK:       ## BB#0:
   4946 ; CHECK-NEXT:    movzbl %sil, %eax
   4947 ; CHECK-NEXT:    kmovw %eax, %k1
   4948 ; CHECK-NEXT:    vpmovqd %zmm0, (%rdi)
   4949 ; CHECK-NEXT:    vpmovqd %zmm0, (%rdi) {%k1}
   4950 ; CHECK-NEXT:    retq
   4951     call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
   4952     call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
   4953     ret void
   4954 }
   4955 
   4956 declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
   4957 
        ; vpmovsqd (signed-saturating q->d truncation): unmasked, merge-masked,
        ; and zero-masked forms.
   4958 define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
   4959 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
   4960 ; CHECK:       ## BB#0:
   4961 ; CHECK-NEXT:    movzbl %dil, %eax
   4962 ; CHECK-NEXT:    kmovw %eax, %k1
   4963 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm1 {%k1}
   4964 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm2 {%k1} {z}
   4965 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm0
   4966 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
   4967 ; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
   4968 ; CHECK-NEXT:    retq
   4969     %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
   4970     %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
   4971     %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
   4972     %res3 = add <8 x i32> %res0, %res1
   4973     %res4 = add <8 x i32> %res3, %res2
   4974     ret <8 x i32> %res4
   4975 }
   4976 
   4977 declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64>, i8)
   4978 
   4979 define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
   4980 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512:
   4981 ; CHECK:       ## BB#0:
   4982 ; CHECK-NEXT:    vpmovsqd %zmm0, (%rdi)
   4983 ; CHECK-NEXT:    kmovw %esi, %k1
   4984 ; CHECK-NEXT:    vpmovsqd %zmm0, (%rdi) {%k1}
   4985 ; CHECK-NEXT:    retq
   4986     call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
   4987     call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
   4988     ret void
   4989 }
   4990 
; Unsigned-saturating truncate vpmovusqd (v8i64 -> v8i32): unmasked, merge-masked,
; and zero-masked register forms, combined with adds so none is dead-code eliminated.
declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmovusqd %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vpmovusqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vpmovusqd %zmm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
    %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
    %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
    %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
    %res3 = add <8 x i32> %res0, %res1
    %res4 = add <8 x i32> %res3, %res2
    ret <8 x i32> %res4
}

; Memory form of vpmovusqd: unmasked store then {%k1}-masked store.
declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovusqd %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovusqd %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    ret void
}
   5025 
; Truncate vpmovdb (v16i32 -> v16i8): unmasked, merge-masked, and zero-masked
; register forms, with adds keeping all three results live.
declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)

define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovdb %zmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpmovdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpmovdb %zmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

; Memory form of vpmovdb: unmasked store then {%k1}-masked store.
declare void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_pmov_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovdb %zmm0, (%rdi)
; CHECK-NEXT:    vpmovdb %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    ret void
}
   5059 
; Signed-saturating truncate vpmovsdb (v16i32 -> v16i8): unmasked, merge-masked,
; and zero-masked register forms, with adds keeping all three results live.
declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)

define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovsdb %zmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpmovsdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpmovsdb %zmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

; Memory form of vpmovsdb: unmasked store then {%k1}-masked store.
declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_pmovs_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsdb %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovsdb %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    ret void
}
   5093 
; Unsigned-saturating truncate vpmovusdb (v16i32 -> v16i8): unmasked, merge-masked,
; and zero-masked register forms, with adds keeping all three results live.
declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)

define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovusdb %zmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpmovusdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpmovusdb %zmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

; Memory form of vpmovusdb: unmasked store then {%k1}-masked store.
declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_pmovus_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovusdb %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovusdb %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    ret void
}
   5127 
; Truncate vpmovdw (v16i32 -> v16i16): unmasked, merge-masked, and zero-masked
; register forms, with adds keeping all three results live.
declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)

define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovdw %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vpmovdw %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vpmovdw %zmm0, %ymm0
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
    %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
    %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
    %res3 = add <16 x i16> %res0, %res1
    %res4 = add <16 x i16> %res3, %res2
    ret <16 x i16> %res4
}

; Memory form of vpmovdw: unmasked store then {%k1}-masked store.
declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_pmov_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovdw %zmm0, (%rdi)
; CHECK-NEXT:    vpmovdw %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    ret void
}
   5161 
; Signed-saturating truncate vpmovsdw (v16i32 -> v16i16): unmasked, merge-masked,
; and zero-masked register forms, with adds keeping all three results live.
declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)

define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovsdw %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vpmovsdw %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vpmovsdw %zmm0, %ymm0
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
    %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
    %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
    %res3 = add <16 x i16> %res0, %res1
    %res4 = add <16 x i16> %res3, %res2
    ret <16 x i16> %res4
}

; Memory form of vpmovsdw: unmasked store then {%k1}-masked store.
declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsdw %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovsdw %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    ret void
}
   5195 
; Unsigned-saturating truncate vpmovusdw (v16i32 -> v16i16): unmasked, merge-masked,
; and zero-masked register forms, with adds keeping all three results live.
declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)

define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovusdw %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vpmovusdw %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vpmovusdw %zmm0, %ymm0
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
    %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
    %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
    %res3 = add <16 x i16> %res0, %res1
    %res4 = add <16 x i16> %res3, %res2
    ret <16 x i16> %res4
}

; Memory form of vpmovusdw: unmasked store then {%k1}-masked store.
declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovusdw %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovusdw %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    ret void
}
   5229 
; vcvtdq2pd (v8i32 -> v8f64): masked and unmasked forms (no rounding operand on
; this intrinsic); the fadd keeps both results live.
declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}
   5246 
; vcvtdq2ps (v16i32 -> v16f32): masked call with rounding operand 4 (default) and
; unmasked call with rounding operand 0 ({rn-sae} in the expected asm).
declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32)

define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtdq2ps {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 0)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
   5262 
; vcvtpd2dq (v8f64 -> v8i32): masked call with rounding operand 4 (default) and
; unmasked call with rounding operand 0 ({rn-sae} in the expected asm).
declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtpd2dq %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvtpd2dq {rn-sae}, %zmm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 0)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
   5279 
; vcvtpd2ps (v8f64 -> v8f32): masked call with rounding operand 4 (default) and
; unmasked call with rounding operand 2 ({ru-sae} in the expected asm).
declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)

define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtpd2ps %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvtpd2ps {ru-sae}, %zmm0, %ymm0
; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 %x2, i32 4)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 -1, i32 2)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}
   5296 
; vcvtpd2udq (v8f64 -> v8u32): masked call with rounding operand 2 ({ru-sae}) and
; unmasked call with rounding operand 0 ({rn-sae}).
declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvtpd2udq {rn-sae}, %zmm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 2)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 0)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
   5313 
; vcvtps2dq (v16f32 -> v16i32): masked call with rounding operand 2 ({ru-sae}) and
; unmasked call with rounding operand 0 ({rn-sae}).
declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>, i16, i32)

define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtps2dq {ru-sae}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtps2dq {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 2)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 0)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
   5329 
; vcvtps2pd (v8f32 -> v8f64): masked call with rounding operand 4 (default) and
; unmasked call with operand 8 ({sae} in the expected asm).
declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double>, i8, i32)

define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtps2pd {sae}, %ymm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 %x2, i32 4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 -1, i32 8)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}
   5346 
; vcvtps2udq (v16f32 -> v16u32): masked call with rounding operand 2 ({ru-sae})
; and unmasked call with rounding operand 0 ({rn-sae}).
declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)

define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtps2udq {ru-sae}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtps2udq {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 2)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 0)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
   5362 
; Truncating convert vcvttpd2dq (v8f64 -> v8i32): masked call with operand 4
; (default) and unmasked call with operand 8 ({sae}).
declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvttpd2dq %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvttpd2dq {sae}, %zmm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
   5379 
; vcvtudq2pd (v8u32 -> v8f64): masked and unmasked forms (no rounding operand on
; this intrinsic); the fadd keeps both results live.
declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}
   5396 
   5397 
; vcvtudq2ps (v16u32 -> v16f32): masked call with rounding operand 4 (default)
; and unmasked call with rounding operand 0 ({rn-sae}).
declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32)

define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtudq2ps %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtudq2ps {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 0)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
   5413 
; Truncating convert vcvttpd2udq (v8f64 -> v8u32): masked call with operand 4
; (default) and unmasked call with operand 8 ({sae}).
declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvttpd2udq %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvttpd2udq {sae}, %zmm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
   5430 
; Truncating convert vcvttps2dq (v16f32 -> v16i32): masked call with operand 4
; (default) and unmasked call with operand 8 ({sae}).
declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32)

define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvttps2dq %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvttps2dq {sae}, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
   5446 
; Truncating convert vcvttps2udq (v16f32 -> v16u32): masked call with operand 4
; (default) and unmasked call with operand 8 ({sae}).
declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>, i16, i32)

define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvttps2udq %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvttps2udq {sae}, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
   5462 
   5463 
; Scalar vscalefss: masked call with rounding operand 4 (default) and unmasked
; call with operand 8 ({rn-sae} in the expected asm); only bit 0 of the i8 mask
; matters for a scalar op, hence the andl $1.
declare <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32)
define <4 x float>@test_int_x86_avx512_mask_scalef_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vscalefss %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vscalefss {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 8)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}
   5479 
; Scalar vscalefsd: masked call with rounding operand 4 (default) and unmasked
; call with operand 8 ({rn-sae}); only bit 0 of the mask matters (andl $1).
declare <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32)
define <2 x double>@test_int_x86_avx512_mask_scalef_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vscalefsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vscalefsd {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 8)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}
   5495 
; Scalar vgetexpss: four variants — masked default-rounding, masked {sae},
; zero-masked {sae}, and unmasked {sae} — folded together with fadds so every
; lowering stays live.
declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_getexp_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddps %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
  %res3 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)

  %res.1 = fadd <4 x float> %res0, %res1
  %res.2 = fadd <4 x float> %res2, %res3
  %res   = fadd <4 x float> %res.1, %res.2
  ret <4 x float> %res
}
   5522 
; Scalar vgetexpsd: four variants — masked default-rounding, masked {sae},
; zero-masked {sae}, and unmasked default-rounding — folded together with fadds
; so every lowering stays live.
declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_getexp_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm4
; CHECK-NEXT:    vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vgetexpsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    vaddpd %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddpd %xmm4, %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
  %res3 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)

  %res.1 = fadd <2 x double> %res0, %res1
  %res.2 = fadd <2 x double> %res2, %res3
  %res   = fadd <2 x double> %res.1, %res.2
  ret <2 x double> %res
}
   5549 
   5550 declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
   5551 
define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    shlb $7, %al
; CHECK-NEXT:    sarb $7, %al
; CHECK-NEXT:    retq

; Single masked scalar double compare: predicate 5 selects vcmpNLTsd and the
; rounding argument 8 selects the "{sae}" form, as shown by the CHECK lines.
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
  ret i8 %res4
}
   5566 
define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpunordsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT:    vcmplesd %xmm1, %xmm0, %k1
; CHECK-NEXT:    korw %k0, %k1, %k0
; CHECK-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k1
; CHECK-NEXT:    vcmpneqsd %xmm1, %xmm0, %k2
; CHECK-NEXT:    korw %k1, %k2, %k1
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k2
; CHECK-NEXT:    kandw %k2, %k1, %k1
; CHECK-NEXT:    korw %k1, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    shlb $7, %al
; CHECK-NEXT:    sarb $7, %al
; CHECK-NEXT:    retq

; Four scalar double compares covering predicates 2/3/4/5 (le/unord/neq/nlt
; per the CHECK mnemonics), unmasked (i8 -1) and masked, with and without SAE
; (rounding arg 8); the i8 results are OR-combined so all four calls survive.
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 3, i8 -1, i32 8)
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 4, i8 %x3, i32 4)
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)

  %res11 = or i8 %res1, %res2
  %res12 = or i8 %res3, %res4
  %res13 = or i8 %res11, %res12
  ret i8 %res13
}
   5595 
   5596 declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
   5597 
define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcmpunordss %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    shlb $7, %al
; CHECK-NEXT:    sarb $7, %al
; CHECK-NEXT:    retq

; Single masked scalar float compare: predicate 3 selects vcmpUNORDss and the
; rounding argument 4 gives the non-SAE form, as shown by the CHECK lines.
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4)
  ret i8 %res2
}
   5612 
   5613 
define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpless %xmm1, %xmm0, %k1
; CHECK-NEXT:    vcmpunordss {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT:    vcmpneqss %xmm1, %xmm0, %k1
; CHECK-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1}
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k2
; CHECK-NEXT:    kandw %k2, %k1, %k1
; CHECK-NEXT:    kandw %k1, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    shlb $7, %al
; CHECK-NEXT:    sarb $7, %al
; CHECK-NEXT:    retq
; Four scalar float compares covering predicates 2/3/4/5 (le/unord/neq/nlt
; per the CHECK mnemonics), masked and unmasked, with and without SAE; the
; results are AND-combined (vs. OR in the sd variant) to keep all calls live.
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 4, i8 %x3, i32 4)
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 5, i8 %x3, i32 8)

  %res11 = and i8 %res1, %res2
  %res12 = and i8 %res3, %res4
  %res13 = and i8 %res11, %res12
  ret i8 %res13
}
   5639 
   5640 declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16)
   5641 
define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
; 128-bit-lane shuffle with immediate 22, once masked and once unmasked
; (i16 -1); the fadd keeps both calls alive.
  %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
   5655 
   5656 declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8)
   5657 
define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddpd %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    retq
; 128-bit-lane double shuffle with immediate 22: merge-masked, unmasked
; (i8 -1), and zero-masked (zeroinitializer passthru); sums keep all three live.
  %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)

  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}
   5677 
   5678 declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
   5679 
define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
; Integer counterpart of shuf.f32x4: immediate 22, masked and unmasked,
; results combined with an integer add.
  %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
   5693 
   5694 declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
   5695 
define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
; Integer counterpart of shuf.f64x2: immediate 22, masked and unmasked,
; results combined with an integer add.
  %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
   5710 
   5711 declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
   5712 
define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vgetmantpd $11, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vgetmantpd $11,{sae}, %zmm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
; Packed getmant with immediate 11: masked with default rounding (4) and
; unmasked with SAE (8, "{sae}" in the CHECK line); fadd keeps both live.
  %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 -1, i32 8)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}
   5727 
   5728 declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
   5729 
define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vgetmantps $11, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vgetmantps $11,{sae}, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
; Float counterpart of getmant.pd.512: immediate 11, masked non-SAE and
; unmasked SAE forms, combined with fadd.
  %res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 8)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
   5743 
   5744 declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32)
   5745 
define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm5
; CHECK-NEXT:    vgetmantsd $11,{sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm4, %xmm3, %xmm0
; CHECK-NEXT:    vaddpd %xmm5, %xmm2, %xmm1
; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
; Scalar getmant (immediate 11) in four flavors: merge-masked, zero-masked
; (zeroinitializer passthru -> {z}), masked SAE, and unmasked (i8 -1); fadds
; keep all four calls live.
  %res  = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 8)
  %res3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 -1, i32 4)
  %res11 = fadd <2 x double> %res, %res1
  %res12 = fadd <2 x double> %res2, %res3
  %res13 = fadd <2 x double> %res11, %res12
  ret <2 x double> %res13
}
   5769 
   5770 declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32)
   5771 
define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm4
; CHECK-NEXT:    vgetmantss $11,{sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
; Scalar float getmant (immediate 11): merge-masked, zero-masked, and two
; unmasked calls (SAE and non-SAE); fadds keep all four live.
  %res  = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 8)
  %res3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 4)
  %res11 = fadd <4 x float> %res, %res1
  %res12 = fadd <4 x float> %res2, %res3
  %res13 = fadd <4 x float> %res11, %res12
  ret <4 x float> %res13
}
   5794 
   5795 declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8)
   5796 
define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshufpd {{.*#+}} zmm2 = zmm2[0],k1[1],zmm2[3],k1[2],zmm2[5],k1[4],zmm2[6],k1[6]
; CHECK-NEXT:    vshufpd {{.*#+}} zmm3 = k1[0],zmm0[1],k1[3],zmm0[2],k1[5],zmm0[4],k1[6],zmm0[6]
; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddpd %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    retq
; Element-level vshufpd with immediate 22: merge-masked, unmasked (i8 -1),
; and zero-masked calls, summed so all three are kept.
  %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)

  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}
   5816 
   5817 declare <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16)
   5818 
define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshufps {{.*#+}} zmm2 = zmm2[2,1],k1[1,0],zmm2[6,5],k1[5,4],zmm2[10,9],k1[9,8],zmm2[14,13],k1[13,12]
; CHECK-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
; Element-level vshufps with immediate 22, masked and unmasked (i16 -1);
; fadd keeps both calls alive.
  %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
   5832 
   5833 declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)
   5834 
define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm1 = zmm1[0,1,3,2,5,4,6,6]
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm2 = k1[0,1,3,2,5,4,6,6]
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 = zmm0[0,1,3,2,5,4,6,6]
; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
; Immediate-form vpermilpd (imm 22): merge-masked, zero-masked, and unmasked
; calls, summed so all three survive.
  %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}
   5853 
   5854 declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)
   5855 
define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilps {{.*#+}} zmm1 = zmm1[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT:    vpermilps {{.*#+}} zmm2 = k1[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
; Immediate-form vpermilps (imm 22): merge-masked, zero-masked, and unmasked
; calls, summed so all three survive.
  %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res3, %res2
  ret <16 x float> %res4
}
   5873 
   5874 declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
   5875 
define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
; Variable-control vpermilpd (control vector in %x1): merge-masked,
; zero-masked, and unmasked calls, summed so all three survive.
  %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}
   5894 
   5895 declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
   5896 
define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
; Variable-control vpermilps (control vector in %x1): merge-masked,
; zero-masked, and unmasked calls, summed so all three survive.
  %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}
   5914 
   5915 declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i8)
   5916 
define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
; 128-bit insert at position 1: merge-masked, unmasked (i8 -1), and
; zero-masked calls, summed so all three are kept.
  %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i8 %x4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i8 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i8 %x4)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}
   5934 
   5935 declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i8)
   5936 
define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
; Integer counterpart of insertf32x4: insert at position 1 with merge-mask,
; no mask, and zero-mask; integer adds keep all three calls live.
  %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i8 %x4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i8 -1)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i8 %x4)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res2, %res3
  ret <16 x i32> %res4
}
   5954 
   5955 declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8)
   5956 
define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
; 256-bit insert at position 1: merge-masked, unmasked (i8 -1), and
; zero-masked calls, summed so all three are kept.
  %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}
   5975 
   5976 declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8)
   5977 
define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
; Integer counterpart of insertf64x4: insert at position 1 with merge-mask,
; no mask, and zero-mask; integer adds keep all three calls live.
  %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res2, %res3
  ret <8 x i64> %res4
}
   5996 
   5997 declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<4 x float>, <4 x float>, <2 x double>, i8, i32)
   5998 
define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<4 x float> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
; Scalar float->double convert: masked with default rounding (4) and unmasked
; with SAE (8, "{sae}" in the CHECK line); fadd keeps both calls live.
  %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<4 x float> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<4 x float> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}
   6013 
   6014 declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<2 x double>, <2 x double>, <4 x float>, i8, i32)
   6015 
define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<2 x double> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
; Scalar double->float convert with explicit static rounding: argument 3
; emits {rz-sae} (masked) and 8 emits {rn-sae} (unmasked), per the CHECK lines.
  %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<2 x double> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<2 x double> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}
   6030 
   6031 declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
   6032 
define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
; Dword ternary-logic (truth-table immediate 33), merge-masked vs. unmasked
; (i16 -1); the add keeps both calls alive.
  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
   6047 
   6048 declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
   6049 
define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
; Zero-masking (maskz) variant of pternlog.d: same immediate 33, producing
; the "{z}" form when masked, plus an unmasked call; add keeps both alive.
  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
   6064 
   6065 declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)
   6066 
define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
; Qword ternary-logic (truth-table immediate 33), merge-masked vs. unmasked
; (i8 -1); the add keeps both calls alive.
  %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
   6082 
; Zero-masked qword ternary-logic intrinsic (imm8 = 33): the masked call must
; lower to VPTERNLOGQ with a zeroing write-mask ({%k1} {z}); the all-ones mask
; call must fold to the unmasked form.
declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)

define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
   6100 
; MOVSLDUP (duplicate even-index single-precision lanes) intrinsic, exercised
; with a merge mask, an all-ones mask, and a zeroing variant (passthru = zero);
; all three must lower to vmovsldup with the 0,0,2,2,... lane pattern.
; NOTE(review): the CHECK lines here show no {%k1} operand on the masked forms,
; so this test pins the shuffle pattern rather than the masking encoding.
declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}
   6120 
; MOVSHDUP (duplicate odd-index single-precision lanes) intrinsic, exercised
; with a merge mask, an all-ones mask, and a zeroing variant; all three must
; lower to vmovshdup with the 1,1,3,3,... lane pattern.
declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}
   6140 
; MOVDDUP (duplicate even-index double-precision lanes) intrinsic, exercised
; with a merge mask, an all-ones mask, and a zeroing variant; all three must
; lower to vmovddup with the 0,0,2,2,4,4,6,6 lane pattern. The i8 mask is
; zero-extended (movzbl) before the kmovw.
declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}
   6161 
; vcomi.sd with an EQ-class predicate (imm 0) and SAE rounding (imm 8):
; must lower to vcomisd with the {sae} operand and materialize the result
; via sete + zero-extension.
define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcomisd {sae}, %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8)
  ret i32 %res
}
   6172 
; Same EQ-class check as above but with predicate imm 8, which selects the
; unordered (vucomisd) form; SAE (rounding imm 8) must still be emitted.
define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vucomisd {sae}, %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8)
  ret i32 %res
}
   6183 
; EQ-class predicate (imm 0) with default rounding (imm 4): plain vcomisd
; without the {sae} operand, result via sete + movzbl.
define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcomisd %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4)
  ret i32 %res
}
   6194 
; Unordered EQ-class predicate (imm 8) with default rounding (imm 4):
; plain vucomisd, result via sete + movzbl.
define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vucomisd %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4)
  ret i32 %res
}
   6205 
; LT-class predicate (imm 1) with SAE (imm 8): vcomisd {sae}, with the carry
; flag turned into a 0/1 result via the sbb/and idiom.
define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcomisd {sae}, %xmm1, %xmm0
; CHECK-NEXT:    sbbl %eax, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8)
  ret i32 %res
}
   6216 
; Unordered LT-class predicate (imm 9) with SAE (imm 8): vucomisd {sae},
; result via the sbb/and carry idiom.
define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vucomisd {sae}, %xmm1, %xmm0
; CHECK-NEXT:    sbbl %eax, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8)
  ret i32 %res
}
   6227 
; LT-class predicate (imm 1) with default rounding (imm 4): plain vcomisd,
; result via the sbb/and carry idiom.
define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcomisd %xmm1, %xmm0
; CHECK-NEXT:    sbbl %eax, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4)
  ret i32 %res
}
   6238 
; Unordered LT-class predicate (imm 9) with default rounding (imm 4):
; plain vucomisd, result via the sbb/and carry idiom.
define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vucomisd %xmm1, %xmm0
; CHECK-NEXT:    sbbl %eax, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4)
  ret i32 %res
}
   6249 
; Intrinsic declaration backing the vcomi.sd tests above.
declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)

; Single-precision variant: unordered LT-class predicate (imm 9), default
; rounding (imm 4) must lower to vucomiss with the sbb/and carry idiom.
define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vucomiss %xmm1, %xmm0
; CHECK-NEXT:    sbbl %eax, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4)
  ret i32 %res
}
   6262 
; Intrinsic declarations for the vcomi.ss test above and the masked
; scalar-move (vmovss) tests below.
declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)

; Masked scalar move, reg-reg, merge-masking: only bit 0 of the i8 mask is
; used (andl $1), and the result merges into the passthru operand %x2.
define <4 x float>@test_int_x86_avx512_mask_move_ss_rrk(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  ret <4 x float> %res
}
   6277 
; Masked scalar move, reg-reg, zero-masking: a zeroinitializer passthru must
; select the {%k1} {z} form of vmovss.
define <4 x float>@test_int_x86_avx512_mask_move_ss_rrkz(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x2)
  ret <4 x float> %res
}
   6288 
; Masked scalar move with an all-ones mask (-1): must fold away the mask and
; emit a plain unmasked vmovss.
define <4 x float>@test_int_x86_avx512_mask_move_ss_rr(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}
   6297 
; Double-precision counterpart of the vmovss tests: all-ones mask (-1) must
; fold to a plain unmasked vmovsd.
declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
define <2 x double>@test_int_x86_avx512_mask_move_sd_rr(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 -1)
  ret <2 x double> %res
}
   6307 
; Masked scalar double move, zero-masking: zeroinitializer passthru selects
; the {%k1} {z} form of vmovsd; only bit 0 of the mask is used (andl $1).
define <2 x double>@test_int_x86_avx512_mask_move_sd_rrkz(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 %x2)
  ret <2 x double> %res
}
   6318 
; Masked scalar double move, merge-masking: the result merges into the
; passthru operand %x2 under {%k1}, then is copied back to the return reg.
define <2 x double>@test_int_x86_avx512_mask_move_sd_rrk(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  ret <2 x double> %res
}
   6330 
   6331