; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s

      5 define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) {
      6 ; CHECK-LABEL: test_mask_compress_pd_512:
      7 ; CHECK:       ## %bb.0:
      8 ; CHECK-NEXT:    kmovw %edi, %k1
      9 ; CHECK-NEXT:    vcompresspd %zmm0, %zmm1 {%k1}
     10 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
     11 ; CHECK-NEXT:    retq
     12   %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %passthru, i8 %mask)
     13   ret <8 x double> %res
     14 }
     15 
     16 define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) {
     17 ; CHECK-LABEL: test_maskz_compress_pd_512:
     18 ; CHECK:       ## %bb.0:
     19 ; CHECK-NEXT:    kmovw %edi, %k1
     20 ; CHECK-NEXT:    vcompresspd %zmm0, %zmm0 {%k1} {z}
     21 ; CHECK-NEXT:    retq
     22   %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
     23   ret <8 x double> %res
     24 }
     25 
     26 define <8 x double> @test_compress_pd_512(<8 x double> %data) {
     27 ; CHECK-LABEL: test_compress_pd_512:
     28 ; CHECK:       ## %bb.0:
     29 ; CHECK-NEXT:    retq
     30   %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> undef, i8 -1)
     31   ret <8 x double> %res
     32 }
     33 
     34 declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
     35 
     36 define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) {
     37 ; CHECK-LABEL: test_mask_compress_ps_512:
     38 ; CHECK:       ## %bb.0:
     39 ; CHECK-NEXT:    kmovw %edi, %k1
     40 ; CHECK-NEXT:    vcompressps %zmm0, %zmm1 {%k1}
     41 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
     42 ; CHECK-NEXT:    retq
     43   %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> %passthru, i16 %mask)
     44   ret <16 x float> %res
     45 }
     46 
     47 define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) {
     48 ; CHECK-LABEL: test_maskz_compress_ps_512:
     49 ; CHECK:       ## %bb.0:
     50 ; CHECK-NEXT:    kmovw %edi, %k1
     51 ; CHECK-NEXT:    vcompressps %zmm0, %zmm0 {%k1} {z}
     52 ; CHECK-NEXT:    retq
     53   %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> zeroinitializer, i16 %mask)
     54   ret <16 x float> %res
     55 }
     56 
     57 define <16 x float> @test_compress_ps_512(<16 x float> %data) {
     58 ; CHECK-LABEL: test_compress_ps_512:
     59 ; CHECK:       ## %bb.0:
     60 ; CHECK-NEXT:    retq
     61   %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> undef, i16 -1)
     62   ret <16 x float> %res
     63 }
     64 
     65 declare <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> %src0, i16 %mask)
     66 
     67 define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) {
     68 ; CHECK-LABEL: test_mask_compress_q_512:
     69 ; CHECK:       ## %bb.0:
     70 ; CHECK-NEXT:    kmovw %edi, %k1
     71 ; CHECK-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
     72 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
     73 ; CHECK-NEXT:    retq
     74   %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask)
     75   ret <8 x i64> %res
     76 }
     77 
     78 define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) {
     79 ; CHECK-LABEL: test_maskz_compress_q_512:
     80 ; CHECK:       ## %bb.0:
     81 ; CHECK-NEXT:    kmovw %edi, %k1
     82 ; CHECK-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
     83 ; CHECK-NEXT:    retq
     84   %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> zeroinitializer, i8 %mask)
     85   ret <8 x i64> %res
     86 }
     87 
     88 define <8 x i64> @test_compress_q_512(<8 x i64> %data) {
     89 ; CHECK-LABEL: test_compress_q_512:
     90 ; CHECK:       ## %bb.0:
     91 ; CHECK-NEXT:    retq
     92   %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> undef, i8 -1)
     93   ret <8 x i64> %res
     94 }
     95 
     96 declare <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> %src0, i8 %mask)
     97 
     98 define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) {
     99 ; CHECK-LABEL: test_mask_compress_d_512:
    100 ; CHECK:       ## %bb.0:
    101 ; CHECK-NEXT:    kmovw %edi, %k1
    102 ; CHECK-NEXT:    vpcompressd %zmm0, %zmm1 {%k1}
    103 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
    104 ; CHECK-NEXT:    retq
    105   %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask)
    106   ret <16 x i32> %res
    107 }
    108 
    109 define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) {
    110 ; CHECK-LABEL: test_maskz_compress_d_512:
    111 ; CHECK:       ## %bb.0:
    112 ; CHECK-NEXT:    kmovw %edi, %k1
    113 ; CHECK-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
    114 ; CHECK-NEXT:    retq
    115   %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> zeroinitializer, i16 %mask)
    116   ret <16 x i32> %res
    117 }
    118 
    119 define <16 x i32> @test_compress_d_512(<16 x i32> %data) {
    120 ; CHECK-LABEL: test_compress_d_512:
    121 ; CHECK:       ## %bb.0:
    122 ; CHECK-NEXT:    retq
    123   %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> undef, i16 -1)
    124   ret <16 x i32> %res
    125 }
    126 
    127 declare <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask)
    128 
    129 define <8 x double> @test_expand_pd_512(<8 x double> %data) {
    130 ; CHECK-LABEL: test_expand_pd_512:
    131 ; CHECK:       ## %bb.0:
    132 ; CHECK-NEXT:    retq
    133   %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> undef, i8 -1)
    134   ret <8 x double> %res
    135 }
    136 
    137 define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) {
    138 ; CHECK-LABEL: test_mask_expand_pd_512:
    139 ; CHECK:       ## %bb.0:
    140 ; CHECK-NEXT:    kmovw %edi, %k1
    141 ; CHECK-NEXT:    vexpandpd %zmm0, %zmm1 {%k1}
    142 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
    143 ; CHECK-NEXT:    retq
    144   %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %passthru, i8 %mask)
    145   ret <8 x double> %res
    146 }
    147 
    148 define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) {
    149 ; CHECK-LABEL: test_maskz_expand_pd_512:
    150 ; CHECK:       ## %bb.0:
    151 ; CHECK-NEXT:    kmovw %edi, %k1
    152 ; CHECK-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
    153 ; CHECK-NEXT:    retq
    154   %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
    155   ret <8 x double> %res
    156 }
    157 
    158 declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
    159 
    160 define <16 x float> @test_expand_ps_512(<16 x float> %data) {
    161 ; CHECK-LABEL: test_expand_ps_512:
    162 ; CHECK:       ## %bb.0:
    163 ; CHECK-NEXT:    retq
    164   %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> undef, i16 -1)
    165   ret <16 x float> %res
    166 }
    167 
    168 define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) {
    169 ; CHECK-LABEL: test_mask_expand_ps_512:
    170 ; CHECK:       ## %bb.0:
    171 ; CHECK-NEXT:    kmovw %edi, %k1
    172 ; CHECK-NEXT:    vexpandps %zmm0, %zmm1 {%k1}
    173 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
    174 ; CHECK-NEXT:    retq
    175   %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> %passthru, i16 %mask)
    176   ret <16 x float> %res
    177 }
    178 
    179 define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) {
    180 ; CHECK-LABEL: test_maskz_expand_ps_512:
    181 ; CHECK:       ## %bb.0:
    182 ; CHECK-NEXT:    kmovw %edi, %k1
    183 ; CHECK-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
    184 ; CHECK-NEXT:    retq
    185   %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> zeroinitializer, i16 %mask)
    186   ret <16 x float> %res
    187 }
    188 
    189 declare <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> %src0, i16 %mask)
    190 
    191 define <8 x i64> @test_expand_q_512(<8 x i64> %data) {
    192 ; CHECK-LABEL: test_expand_q_512:
    193 ; CHECK:       ## %bb.0:
    194 ; CHECK-NEXT:    retq
    195   %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> undef, i8 -1)
    196   ret <8 x i64> %res
    197 }
    198 
    199 define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) {
    200 ; CHECK-LABEL: test_mask_expand_q_512:
    201 ; CHECK:       ## %bb.0:
    202 ; CHECK-NEXT:    kmovw %edi, %k1
    203 ; CHECK-NEXT:    vpexpandq %zmm0, %zmm1 {%k1}
    204 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
    205 ; CHECK-NEXT:    retq
    206   %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask)
    207   ret <8 x i64> %res
    208 }
    209 
    210 define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) {
    211 ; CHECK-LABEL: test_maskz_expand_q_512:
    212 ; CHECK:       ## %bb.0:
    213 ; CHECK-NEXT:    kmovw %edi, %k1
    214 ; CHECK-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
    215 ; CHECK-NEXT:    retq
    216   %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> zeroinitializer, i8 %mask)
    217   ret <8 x i64> %res
    218 }
    219 
    220 declare <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> %src0, i8 %mask)
    221 
    222 define <16 x i32> @test_expand_d_512(<16 x i32> %data) {
    223 ; CHECK-LABEL: test_expand_d_512:
    224 ; CHECK:       ## %bb.0:
    225 ; CHECK-NEXT:    retq
    226   %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> undef, i16 -1)
    227   ret <16 x i32> %res
    228 }
    229 
    230 define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) {
    231 ; CHECK-LABEL: test_mask_expand_d_512:
    232 ; CHECK:       ## %bb.0:
    233 ; CHECK-NEXT:    kmovw %edi, %k1
    234 ; CHECK-NEXT:    vpexpandd %zmm0, %zmm1 {%k1}
    235 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
    236 ; CHECK-NEXT:    retq
    237   %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask)
    238   ret <16 x i32> %res
    239 }
    240 
    241 define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) {
    242 ; CHECK-LABEL: test_maskz_expand_d_512:
    243 ; CHECK:       ## %bb.0:
    244 ; CHECK-NEXT:    kmovw %edi, %k1
    245 ; CHECK-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
    246 ; CHECK-NEXT:    retq
    247   %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> zeroinitializer, i16 %mask)
    248   ret <16 x i32> %res
    249 }
    250 
    251 declare <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask)
    252 
    253 define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
    254 ; CHECK-LABEL: test_rcp_ps_512:
    255 ; CHECK:       ## %bb.0:
    256 ; CHECK-NEXT:    vrcp14ps %zmm0, %zmm0
    257 ; CHECK-NEXT:    retq
    258   %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
    259   ret <16 x float> %res
    260 }
    261 declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
    262 
    263 define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
    264 ; CHECK-LABEL: test_rcp_pd_512:
    265 ; CHECK:       ## %bb.0:
    266 ; CHECK-NEXT:    vrcp14pd %zmm0, %zmm0
    267 ; CHECK-NEXT:    retq
    268   %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1]
    269   ret <8 x double> %res
    270 }
    271 declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
    272 
    273 declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32)
    274 
    275 define <2 x double> @test_rndscale_sd(<2 x double> %a, <2 x double> %b) {
    276 ; CHECK-LABEL: test_rndscale_sd:
    277 ; CHECK:       ## %bb.0:
    278 ; CHECK-NEXT:    vroundsd $11, %xmm1, %xmm0, %xmm0
    279 ; CHECK-NEXT:    retq
    280   %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 11, i32 4)
    281   ret <2 x double>%res
    282 }
    283 
    284 define <2 x double> @test_rndscale_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
    285 ; CHECK-LABEL: test_rndscale_sd_mask:
    286 ; CHECK:       ## %bb.0:
    287 ; CHECK-NEXT:    kmovw %edi, %k1
    288 ; CHECK-NEXT:    vrndscalesd $11, %xmm1, %xmm0, %xmm2 {%k1}
    289 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
    290 ; CHECK-NEXT:    retq
    291   %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4)
    292   ret <2 x double>%res
    293 }
    294 
    295 define <2 x double> @test_rndscale_sd_mask_load(<2 x double> %a, <2 x double>* %bptr, <2 x double> %c, i8 %mask) {
    296 ; CHECK-LABEL: test_rndscale_sd_mask_load:
    297 ; CHECK:       ## %bb.0:
    298 ; CHECK-NEXT:    kmovw %esi, %k1
    299 ; CHECK-NEXT:    vrndscalesd $11, (%rdi), %xmm0, %xmm1 {%k1}
    300 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
    301 ; CHECK-NEXT:    retq
    302   %b = load <2 x double>, <2 x double>* %bptr
    303   %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4)
    304   ret <2 x double>%res
    305 }
    306 
    307 define <2 x double> @test_rndscale_sd_maskz(<2 x double> %a, <2 x double> %b, i8 %mask) {
    308 ; CHECK-LABEL: test_rndscale_sd_maskz:
    309 ; CHECK:       ## %bb.0:
    310 ; CHECK-NEXT:    kmovw %edi, %k1
    311 ; CHECK-NEXT:    vrndscalesd $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
    312 ; CHECK-NEXT:    retq
    313   %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> zeroinitializer, i8 %mask, i32 11, i32 4)
    314   ret <2 x double>%res
    315 }
    316 
    317 declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32)
    318 
    319 define <4 x float> @test_rndscale_ss(<4 x float> %a, <4 x float> %b) {
    320 ; CHECK-LABEL: test_rndscale_ss:
    321 ; CHECK:       ## %bb.0:
    322 ; CHECK-NEXT:    vroundss $11, %xmm1, %xmm0, %xmm0
    323 ; CHECK-NEXT:    retq
    324   %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
    325   ret <4 x float>%res
    326 }
    327 
    328 define <4 x float> @test_rndscale_ss_load(<4 x float> %a, <4 x float>* %bptr) {
    329 ; CHECK-LABEL: test_rndscale_ss_load:
    330 ; CHECK:       ## %bb.0:
    331 ; CHECK-NEXT:    vroundss $11, (%rdi), %xmm0, %xmm0
    332 ; CHECK-NEXT:    retq
    333   %b = load <4 x float>, <4 x float>* %bptr
    334   %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
    335   ret <4 x float>%res
    336 }
    337 
    338 define <4 x float> @test_rndscale_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
    339 ; CHECK-LABEL: test_rndscale_ss_mask:
    340 ; CHECK:       ## %bb.0:
    341 ; CHECK-NEXT:    kmovw %edi, %k1
    342 ; CHECK-NEXT:    vrndscaless $11, %xmm1, %xmm0, %xmm2 {%k1}
    343 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
    344 ; CHECK-NEXT:    retq
    345   %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 11, i32 4)
    346   ret <4 x float>%res
    347 }
    348 
    349 define <4 x float> @test_rndscale_ss_maskz(<4 x float> %a, <4 x float> %b, i8 %mask) {
    350 ; CHECK-LABEL: test_rndscale_ss_maskz:
    351 ; CHECK:       ## %bb.0:
    352 ; CHECK-NEXT:    kmovw %edi, %k1
    353 ; CHECK-NEXT:    vrndscaless $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
    354 ; CHECK-NEXT:    retq
    355   %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask, i32 11, i32 4)
    356   ret <4 x float>%res
    357 }
    358 
    359 declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
    360 
    361 define <8 x double> @test7(<8 x double> %a) {
    362 ; CHECK-LABEL: test7:
    363 ; CHECK:       ## %bb.0:
    364 ; CHECK-NEXT:    vrndscalepd $11, %zmm0, %zmm0
    365 ; CHECK-NEXT:    retq
    366   %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4)
    367   ret <8 x double>%res
    368 }
    369 
    370 declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
    371 
    372 define <16 x float> @test8(<16 x float> %a) {
    373 ; CHECK-LABEL: test8:
    374 ; CHECK:       ## %bb.0:
    375 ; CHECK-NEXT:    vrndscaleps $11, %zmm0, %zmm0
    376 ; CHECK-NEXT:    retq
    377   %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4)
    378   ret <16 x float>%res
    379 }
    380 
    381 define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
    382 ; CHECK-LABEL: test_rsqrt_ps_512:
    383 ; CHECK:       ## %bb.0:
    384 ; CHECK-NEXT:    vrsqrt14ps %zmm0, %zmm0
    385 ; CHECK-NEXT:    retq
    386   %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
    387   ret <16 x float> %res
    388 }
    389 declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
    390 
    391 define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
    392 ; CHECK-LABEL: test_sqrt_pd_512:
    393 ; CHECK:       ## %bb.0:
    394 ; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
    395 ; CHECK-NEXT:    retq
    396   %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
    397   ret <8 x double> %1
    398 }
    399 
    400 define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
    401 ; CHECK-LABEL: test_mask_sqrt_pd_512:
    402 ; CHECK:       ## %bb.0:
    403 ; CHECK-NEXT:    kmovw %edi, %k1
    404 ; CHECK-NEXT:    vsqrtpd %zmm0, %zmm1 {%k1}
    405 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
    406 ; CHECK-NEXT:    retq
    407   %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
    408   %2 = bitcast i8 %mask to <8 x i1>
    409   %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
    410   ret <8 x double> %3
    411 }
    412 
    413 define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) {
    414 ; CHECK-LABEL: test_maskz_sqrt_pd_512:
    415 ; CHECK:       ## %bb.0:
    416 ; CHECK-NEXT:    kmovw %edi, %k1
    417 ; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
    418 ; CHECK-NEXT:    retq
    419   %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
    420   %2 = bitcast i8 %mask to <8 x i1>
    421   %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
    422   ret <8 x double> %3
    423 }
    424 declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
    425 
    426 define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) {
    427 ; CHECK-LABEL: test_sqrt_round_pd_512:
    428 ; CHECK:       ## %bb.0:
    429 ; CHECK-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm0
    430 ; CHECK-NEXT:    retq
    431   %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
    432   ret <8 x double> %1
    433 }
    434 
    435 define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
    436 ; CHECK-LABEL: test_mask_sqrt_round_pd_512:
    437 ; CHECK:       ## %bb.0:
    438 ; CHECK-NEXT:    kmovw %edi, %k1
    439 ; CHECK-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm1 {%k1}
    440 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
    441 ; CHECK-NEXT:    retq
    442   %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
    443   %2 = bitcast i8 %mask to <8 x i1>
    444   %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
    445   ret <8 x double> %3
    446 }
    447 
    448 define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) {
    449 ; CHECK-LABEL: test_maskz_sqrt_round_pd_512:
    450 ; CHECK:       ## %bb.0:
    451 ; CHECK-NEXT:    kmovw %edi, %k1
    452 ; CHECK-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm0 {%k1} {z}
    453 ; CHECK-NEXT:    retq
    454   %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
    455   %2 = bitcast i8 %mask to <8 x i1>
    456   %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
    457   ret <8 x double> %3
    458 }
    459 declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32) nounwind readnone
    460 
    461 define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
    462 ; CHECK-LABEL: test_sqrt_ps_512:
    463 ; CHECK:       ## %bb.0:
    464 ; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
    465 ; CHECK-NEXT:    retq
    466   %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
    467   ret <16 x float> %1
    468 }
    469 
    470 define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
    471 ; CHECK-LABEL: test_mask_sqrt_ps_512:
    472 ; CHECK:       ## %bb.0:
    473 ; CHECK-NEXT:    kmovw %edi, %k1
    474 ; CHECK-NEXT:    vsqrtps %zmm0, %zmm1 {%k1}
    475 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
    476 ; CHECK-NEXT:    retq
    477   %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
    478   %2 = bitcast i16 %mask to <16 x i1>
    479   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
    480   ret <16 x float> %3
    481 }
    482 
    483 define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) {
    484 ; CHECK-LABEL: test_maskz_sqrt_ps_512:
    485 ; CHECK:       ## %bb.0:
    486 ; CHECK-NEXT:    kmovw %edi, %k1
    487 ; CHECK-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
    488 ; CHECK-NEXT:    retq
    489   %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
    490   %2 = bitcast i16 %mask to <16 x i1>
    491   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
    492   ret <16 x float> %3
    493 }
    494 declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
    495 
    496 define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) {
    497 ; CHECK-LABEL: test_sqrt_round_ps_512:
    498 ; CHECK:       ## %bb.0:
    499 ; CHECK-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm0
    500 ; CHECK-NEXT:    retq
    501   %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
    502   ret <16 x float> %1
    503 }
    504 
    505 define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
    506 ; CHECK-LABEL: test_mask_sqrt_round_ps_512:
    507 ; CHECK:       ## %bb.0:
    508 ; CHECK-NEXT:    kmovw %edi, %k1
    509 ; CHECK-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm1 {%k1}
    510 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
    511 ; CHECK-NEXT:    retq
    512   %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
    513   %2 = bitcast i16 %mask to <16 x i1>
    514   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
    515   ret <16 x float> %3
    516 }
    517 
    518 define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) {
    519 ; CHECK-LABEL: test_maskz_sqrt_round_ps_512:
    520 ; CHECK:       ## %bb.0:
    521 ; CHECK-NEXT:    kmovw %edi, %k1
    522 ; CHECK-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm0 {%k1} {z}
    523 ; CHECK-NEXT:    retq
    524   %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
    525   %2 = bitcast i16 %mask to <16 x i1>
    526   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
    527   ret <16 x float> %3
    528 }
    529 declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32) nounwind readnone
    530 
    531 define <8 x double> @test_getexp_pd_512(<8 x double> %a0) {
    532 ; CHECK-LABEL: test_getexp_pd_512:
    533 ; CHECK:       ## %bb.0:
    534 ; CHECK-NEXT:    vgetexppd %zmm0, %zmm0
    535 ; CHECK-NEXT:    retq
    536   %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0,  <8 x double> zeroinitializer, i8 -1, i32 4)
    537   ret <8 x double> %res
    538 }
    539 define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) {
    540 ; CHECK-LABEL: test_getexp_round_pd_512:
    541 ; CHECK:       ## %bb.0:
    542 ; CHECK-NEXT:    vgetexppd {sae}, %zmm0, %zmm0
    543 ; CHECK-NEXT:    retq
    544   %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0,  <8 x double> zeroinitializer, i8 -1, i32 8)
    545   ret <8 x double> %res
    546 }
    547 declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
    548 
    549 define <16 x float> @test_getexp_ps_512(<16 x float> %a0) {
    550 ; CHECK-LABEL: test_getexp_ps_512:
    551 ; CHECK:       ## %bb.0:
    552 ; CHECK-NEXT:    vgetexpps %zmm0, %zmm0
    553 ; CHECK-NEXT:    retq
    554   %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
    555   ret <16 x float> %res
    556 }
    557 
    558 define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) {
    559 ; CHECK-LABEL: test_getexp_round_ps_512:
    560 ; CHECK:       ## %bb.0:
    561 ; CHECK-NEXT:    vgetexpps {sae}, %zmm0, %zmm0
    562 ; CHECK-NEXT:    retq
    563   %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
    564   ret <16 x float> %res
    565 }
    566 declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
    567 
    568 declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
    569 
    570 define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
    571 ; CHECK-LABEL: test_sqrt_ss:
    572 ; CHECK:       ## %bb.0:
    573 ; CHECK-NEXT:    kmovw %edi, %k1
    574 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
    575 ; CHECK-NEXT:    vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
    576 ; CHECK-NEXT:    vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
    577 ; CHECK-NEXT:    vaddps %xmm2, %xmm3, %xmm2
    578 ; CHECK-NEXT:    vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
    579 ; CHECK-NEXT:    vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
    580 ; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
    581 ; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
    582 ; CHECK-NEXT:    retq
    583   %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
    584   %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
    585   %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 2)
    586   %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 3)
    587 
    588   %res.1 = fadd <4 x float> %res0, %res1
    589   %res.2 = fadd <4 x float> %res2, %res3
    590   %res   = fadd <4 x float> %res.1, %res.2
    591   ret <4 x float> %res
    592 }
    593 
    594 declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
    595 
    596 define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
    597 ; CHECK-LABEL: test_sqrt_sd:
    598 ; CHECK:       ## %bb.0:
    599 ; CHECK-NEXT:    kmovw %edi, %k1
    600 ; CHECK-NEXT:    vmovapd %xmm2, %xmm3
    601 ; CHECK-NEXT:    vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
    602 ; CHECK-NEXT:    vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
    603 ; CHECK-NEXT:    vaddpd %xmm2, %xmm3, %xmm2
    604 ; CHECK-NEXT:    vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
    605 ; CHECK-NEXT:    vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
    606 ; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
    607 ; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
    608 ; CHECK-NEXT:    retq
    609   %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
    610   %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
    611   %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 2)
    612   %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 3)
    613 
    614   %res.1 = fadd <2 x double> %res0, %res1
    615   %res.2 = fadd <2 x double> %res2, %res3
    616   %res   = fadd <2 x double> %res.1, %res.2
    617   ret <2 x double> %res
    618 }
    619 
    620 define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
    621 ; CHECK-LABEL: test_x86_sse2_cvtsd2si64:
    622 ; CHECK:       ## %bb.0:
    623 ; CHECK-NEXT:    vcvtsd2si %xmm0, %rax
    624 ; CHECK-NEXT:    retq
    625   %res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
    626   ret i64 %res
    627 }
    628 declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
    629 
    630 define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) {
    631 ; CHECK-LABEL: test_x86_sse2_cvtsi642sd:
    632 ; CHECK:       ## %bb.0:
    633 ; CHECK-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0
    634 ; CHECK-NEXT:    retq
    635   %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
    636   ret <2 x double> %res
    637 }
    638 declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
    639 
    ; Truncating double -> signed/unsigned integer conversions. The trailing
    ; i32 operand selects the exception-suppression form: 4 = default (plain
    ; instruction), 8 = {sae}. Each test calls both forms and adds the results
    ; so both instructions must appear in the output.
    640 define i64 @test_x86_avx512_cvttsd2si64(<2 x double> %a0) {
    641 ; CHECK-LABEL: test_x86_avx512_cvttsd2si64:
    642 ; CHECK:       ## %bb.0:
    643 ; CHECK-NEXT:    vcvttsd2si %xmm0, %rcx
    644 ; CHECK-NEXT:    vcvttsd2si {sae}, %xmm0, %rax
    645 ; CHECK-NEXT:    addq %rcx, %rax
    646 ; CHECK-NEXT:    retq
    647   %res0 = call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %a0, i32 4) ;
    648   %res1 = call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %a0, i32 8) ;
    649   %res2 = add i64 %res0, %res1
    650   ret i64 %res2
    651 }
    652 declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32) nounwind readnone
    653 
    654 define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) {
    655 ; CHECK-LABEL: test_x86_avx512_cvttsd2usi:
    656 ; CHECK:       ## %bb.0:
    657 ; CHECK-NEXT:    vcvttsd2usi %xmm0, %ecx
    658 ; CHECK-NEXT:    vcvttsd2usi {sae}, %xmm0, %eax
    659 ; CHECK-NEXT:    addl %ecx, %eax
    660 ; CHECK-NEXT:    retq
    661   %res0 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ;
    662   %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ;
    663   %res2 = add i32 %res0, %res1
    664   ret i32 %res2
    665 }
    666 declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone
    667 
    668 define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) {
    669 ; CHECK-LABEL: test_x86_avx512_cvttsd2si:
    670 ; CHECK:       ## %bb.0:
    671 ; CHECK-NEXT:    vcvttsd2si %xmm0, %ecx
    672 ; CHECK-NEXT:    vcvttsd2si {sae}, %xmm0, %eax
    673 ; CHECK-NEXT:    addl %ecx, %eax
    674 ; CHECK-NEXT:    retq
    675   %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ;
    676   %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ;
    677   %res2 = add i32 %res0, %res1
    678   ret i32 %res2
    679 }
    680 declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone
    681 
    682 
    683 
    684 define i64 @test_x86_avx512_cvttsd2usi64(<2 x double> %a0) {
    685 ; CHECK-LABEL: test_x86_avx512_cvttsd2usi64:
    686 ; CHECK:       ## %bb.0:
    687 ; CHECK-NEXT:    vcvttsd2usi %xmm0, %rcx
    688 ; CHECK-NEXT:    vcvttsd2usi {sae}, %xmm0, %rax
    689 ; CHECK-NEXT:    addq %rcx, %rax
    690 ; CHECK-NEXT:    retq
    691   %res0 = call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %a0, i32 4) ;
    692   %res1 = call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %a0, i32 8) ;
    693   %res2 = add i64 %res0, %res1
    694   ret i64 %res2
    695 }
    696 declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32) nounwind readnone
    697 
    ; Scalar SSE conversions between <4 x float> and i64; the VEX-encoded
    ; vcvtss2si / vcvtsi2ssq forms are selected under +avx512f.
    698 define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
    699 ; CHECK-LABEL: test_x86_sse_cvtss2si64:
    700 ; CHECK:       ## %bb.0:
    701 ; CHECK-NEXT:    vcvtss2si %xmm0, %rax
    702 ; CHECK-NEXT:    retq
    703   %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
    704   ret i64 %res
    705 }
    706 declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
    707 
    708 
    709 define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
    710 ; CHECK-LABEL: test_x86_sse_cvtsi642ss:
    711 ; CHECK:       ## %bb.0:
    712 ; CHECK-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0
    713 ; CHECK-NEXT:    retq
    714   %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
    715   ret <4 x float> %res
    716 }
    717 declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
    718 
    719 
    ; Truncating float -> integer conversions, same i32 operand convention as
    ; the double variants above: 4 = default, 8 = {sae}. The _load variant
    ; additionally checks the vector load folds into the conversion's memory
    ; operand (vcvttss2si (%rdi)).
    720 define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) {
    721 ; CHECK-LABEL: test_x86_avx512_cvttss2si:
    722 ; CHECK:       ## %bb.0:
    723 ; CHECK-NEXT:    vcvttss2si {sae}, %xmm0, %ecx
    724 ; CHECK-NEXT:    vcvttss2si %xmm0, %eax
    725 ; CHECK-NEXT:    addl %ecx, %eax
    726 ; CHECK-NEXT:    retq
    727   %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ;
    728   %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ;
    729   %res2 = add i32 %res0, %res1
    730   ret i32 %res2
    731 }
    732 declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone
    733 
    734 define i32 @test_x86_avx512_cvttss2si_load(<4 x float>* %a0) {
    735 ; CHECK-LABEL: test_x86_avx512_cvttss2si_load:
    736 ; CHECK:       ## %bb.0:
    737 ; CHECK-NEXT:    vcvttss2si (%rdi), %eax
    738 ; CHECK-NEXT:    retq
    739   %a1 = load <4 x float>, <4 x float>* %a0
    740   %res = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a1, i32 4) ;
    741   ret i32 %res
    742 }
    743 
    744 define i64 @test_x86_avx512_cvttss2si64(<4 x float> %a0) {
    745 ; CHECK-LABEL: test_x86_avx512_cvttss2si64:
    746 ; CHECK:       ## %bb.0:
    747 ; CHECK-NEXT:    vcvttss2si %xmm0, %rcx
    748 ; CHECK-NEXT:    vcvttss2si {sae}, %xmm0, %rax
    749 ; CHECK-NEXT:    addq %rcx, %rax
    750 ; CHECK-NEXT:    retq
    751   %res0 = call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %a0, i32 4) ;
    752   %res1 = call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %a0, i32 8) ;
    753   %res2 = add i64 %res0, %res1
    754   ret i64 %res2
    755 }
    756 declare i64 @llvm.x86.avx512.cvttss2si64(<4 x float>, i32) nounwind readnone
    757 
    758 define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) {
    759 ; CHECK-LABEL: test_x86_avx512_cvttss2usi:
    760 ; CHECK:       ## %bb.0:
    761 ; CHECK-NEXT:    vcvttss2usi {sae}, %xmm0, %ecx
    762 ; CHECK-NEXT:    vcvttss2usi %xmm0, %eax
    763 ; CHECK-NEXT:    addl %ecx, %eax
    764 ; CHECK-NEXT:    retq
    765   %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ;
    766   %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ;
    767   %res2 = add i32 %res0, %res1
    768   ret i32 %res2
    769 }
    770 declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone
    771 
    772 define i64 @test_x86_avx512_cvttss2usi64(<4 x float> %a0) {
    773 ; CHECK-LABEL: test_x86_avx512_cvttss2usi64:
    774 ; CHECK:       ## %bb.0:
    775 ; CHECK-NEXT:    vcvttss2usi %xmm0, %rcx
    776 ; CHECK-NEXT:    vcvttss2usi {sae}, %xmm0, %rax
    777 ; CHECK-NEXT:    addq %rcx, %rax
    778 ; CHECK-NEXT:    retq
    779   %res0 = call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %a0, i32 4) ;
    780   %res1 = call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %a0, i32 8) ;
    781   %res2 = add i64 %res0, %res1
    782   ret i64 %res2
    783 }
    784 declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone
    785 
    ; Non-truncating scalar conversions to 64-bit integers with embedded
    ; rounding. The i32 operand encodes the static rounding mode, matched by
    ; the CHECK lines: 4 = current MXCSR rounding (plain instruction),
    ; 3 = {rz-sae}, 1 = {rd-sae}. The three results are chained through adds
    ; so all three encodings must be emitted.
    786 define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
    787 ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
    788 ; CHECK:       ## %bb.0:
    789 ; CHECK-NEXT:    vcvtsd2usi %xmm0, %rax
    790 ; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %rcx
    791 ; CHECK-NEXT:    addq %rax, %rcx
    792 ; CHECK-NEXT:    vcvtsd2usi {rd-sae}, %xmm0, %rax
    793 ; CHECK-NEXT:    addq %rcx, %rax
    794 ; CHECK-NEXT:    retq
    795 
    796   %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4)
    797   %res1 = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 3)
    798   %res2 = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 1)
    799   %res3 = add i64 %res, %res1
    800   %res4 = add i64 %res3, %res2
    801   ret i64 %res4
    802 }
    803 declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone
    804 
    805 define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) {
    806 ; CHECK-LABEL: test_x86_avx512_cvtsd2si64:
    807 ; CHECK:       ## %bb.0:
    808 ; CHECK-NEXT:    vcvtsd2si %xmm0, %rax
    809 ; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %rcx
    810 ; CHECK-NEXT:    addq %rax, %rcx
    811 ; CHECK-NEXT:    vcvtsd2si {rd-sae}, %xmm0, %rax
    812 ; CHECK-NEXT:    addq %rcx, %rax
    813 ; CHECK-NEXT:    retq
    814 
    815   %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4)
    816   %res1 = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 3)
    817   %res2 = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 1)
    818   %res3 = add i64 %res, %res1
    819   %res4 = add i64 %res3, %res2
    820   ret i64 %res4
    821 }
    822 declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone
    823 
    824 define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) {
    825 ; CHECK-LABEL: test_x86_avx512_cvtss2usi64:
    826 ; CHECK:       ## %bb.0:
    827 ; CHECK-NEXT:    vcvtss2usi %xmm0, %rax
    828 ; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %rcx
    829 ; CHECK-NEXT:    addq %rax, %rcx
    830 ; CHECK-NEXT:    vcvtss2usi {rd-sae}, %xmm0, %rax
    831 ; CHECK-NEXT:    addq %rcx, %rax
    832 ; CHECK-NEXT:    retq
    833 
    834   %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4)
    835   %res1 = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 3)
    836   %res2 = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 1)
    837   %res3 = add i64 %res, %res1
    838   %res4 = add i64 %res3, %res2
    839   ret i64 %res4
    840 }
    841 declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone
    842 
    843 define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) {
    844 ; CHECK-LABEL: test_x86_avx512_cvtss2si64:
    845 ; CHECK:       ## %bb.0:
    846 ; CHECK-NEXT:    vcvtss2si %xmm0, %rax
    847 ; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %rcx
    848 ; CHECK-NEXT:    addq %rax, %rcx
    849 ; CHECK-NEXT:    vcvtss2si {rd-sae}, %xmm0, %rax
    850 ; CHECK-NEXT:    addq %rcx, %rax
    851 ; CHECK-NEXT:    retq
    852 
    853   %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4)
    854   %res1 = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 3)
    855   %res2 = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 1)
    856   %res3 = add i64 %res, %res1
    857   %res4 = add i64 %res3, %res2
    858   ret i64 %res4
    859 }
    860 declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone
    861 
    ; 32-bit counterparts of the rounded conversions above; same rounding
    ; operand encoding (4 = current, 3 = {rz-sae}, 1 = {rd-sae}).
    862 define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
    863 ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
    864 ; CHECK:       ## %bb.0:
    865 ; CHECK-NEXT:    vcvtsd2usi %xmm0, %eax
    866 ; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %ecx
    867 ; CHECK-NEXT:    addl %eax, %ecx
    868 ; CHECK-NEXT:    vcvtsd2usi {rd-sae}, %xmm0, %eax
    869 ; CHECK-NEXT:    addl %ecx, %eax
    870 ; CHECK-NEXT:    retq
    871 
    872   %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
    873   %res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 3)
    874   %res2 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 1)
    875   %res3 = add i32 %res, %res1
    876   %res4 = add i32 %res3, %res2
    877   ret i32 %res4
    878 }
    879 declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
    880 
    881 define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
    882 ; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
    883 ; CHECK:       ## %bb.0:
    884 ; CHECK-NEXT:    vcvtsd2si %xmm0, %eax
    885 ; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %ecx
    886 ; CHECK-NEXT:    addl %eax, %ecx
    887 ; CHECK-NEXT:    vcvtsd2si {rd-sae}, %xmm0, %eax
    888 ; CHECK-NEXT:    addl %ecx, %eax
    889 ; CHECK-NEXT:    retq
    890 
    891   %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
    892   %res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 3)
    893   %res2 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 1)
    894   %res3 = add i32 %res, %res1
    895   %res4 = add i32 %res3, %res2
    896   ret i32 %res4
    897 }
    898 declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
    899 
    900 define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
    901 ; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
    902 ; CHECK:       ## %bb.0:
    903 ; CHECK-NEXT:    vcvtss2usi %xmm0, %eax
    904 ; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %ecx
    905 ; CHECK-NEXT:    addl %eax, %ecx
    906 ; CHECK-NEXT:    vcvtss2usi {rd-sae}, %xmm0, %eax
    907 ; CHECK-NEXT:    addl %ecx, %eax
    908 ; CHECK-NEXT:    retq
    909 
    910   %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
    911   %res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 3)
    912   %res2 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 1)
    913   %res3 = add i32 %res, %res1
    914   %res4 = add i32 %res3, %res2
    915   ret i32 %res4
    916 }
    917 declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
    918 
    919 define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
    920 ; CHECK-LABEL: test_x86_avx512_cvtss2si32:
    921 ; CHECK:       ## %bb.0:
    922 ; CHECK-NEXT:    vcvtss2si %xmm0, %eax
    923 ; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %ecx
    924 ; CHECK-NEXT:    addl %eax, %ecx
    925 ; CHECK-NEXT:    vcvtss2si {rd-sae}, %xmm0, %eax
    926 ; CHECK-NEXT:    addl %ecx, %eax
    927 ; CHECK-NEXT:    retq
    928 
    929   %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
    930   %res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 3)
    931   %res2 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 1)
    932   %res3 = add i32 %res, %res1
    933   %res4 = add i32 %res3, %res2
    934   ret i32 %res4
    935 }
    936 declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone
    937 
    ; Half -> single conversion intrinsic (args: source, passthru, i16 mask,
    ; i32 rounding where 4 = default and 8 = {sae}). The variants pin the
    ; masking lowering: all-ones mask + zero passthru -> unmasked; explicit
    ; passthru -> merge-masked {%k1}; zero passthru + mask -> zeroing {%k1} {z}.
    938 define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
    939 ; CHECK-LABEL: test_x86_vcvtph2ps_512:
    940 ; CHECK:       ## %bb.0:
    941 ; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm0
    942 ; CHECK-NEXT:    retq
    943   %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
    944   ret <16 x float> %res
    945 }
    946 
    947 define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) {
    948 ; CHECK-LABEL: test_x86_vcvtph2ps_512_sae:
    949 ; CHECK:       ## %bb.0:
    950 ; CHECK-NEXT:    vcvtph2ps {sae}, %ymm0, %zmm0
    951 ; CHECK-NEXT:    retq
    952   %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
    953   ret <16 x float> %res
    954 }
    955 
    956 define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) {
    957 ; CHECK-LABEL: test_x86_vcvtph2ps_512_rrk:
    958 ; CHECK:       ## %bb.0:
    959 ; CHECK-NEXT:    kmovw %edi, %k1
    960 ; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm1 {%k1}
    961 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
    962 ; CHECK-NEXT:    retq
    963   %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4)
    964   ret <16 x float> %res
    965 }
    966 
    967 define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) {
    968 ; CHECK-LABEL: test_x86_vcvtph2ps_512_sae_rrkz:
    969 ; CHECK:       ## %bb.0:
    970 ; CHECK-NEXT:    kmovw %edi, %k1
    971 ; CHECK-NEXT:    vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z}
    972 ; CHECK-NEXT:    retq
    973   %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8)
    974   ret <16 x float> %res
    975 }
    976 
    977 define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) {
    978 ; CHECK-LABEL: test_x86_vcvtph2ps_512_rrkz:
    979 ; CHECK:       ## %bb.0:
    980 ; CHECK-NEXT:    kmovw %edi, %k1
    981 ; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm0 {%k1} {z}
    982 ; CHECK-NEXT:    retq
    983   %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
    984   ret <16 x float> %res
    985 }
    986 
    987 declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
    988 
    ; Single -> half conversion (imm 2 selects the rounding control encoded in
    ; the instruction's immediate). One intrinsic is exercised three ways in a
    ; single function: unmasked (-1 mask, result stored to %dst), zero-masked
    ; ({%k1} {z}), and merge-masked into %src ({%k1}).
    989 define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, <16 x i16> * %dst) {
    990 ; CHECK-LABEL: test_x86_vcvtps2ph_256:
    991 ; CHECK:       ## %bb.0:
    992 ; CHECK-NEXT:    kmovw %edi, %k1
    993 ; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm1 {%k1}
    994 ; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
    995 ; CHECK-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
    996 ; CHECK-NEXT:    vcvtps2ph $2, %zmm0, (%rsi)
    997 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
    998 ; CHECK-NEXT:    retq
    999   %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
   1000   %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
   1001   %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> %src, i16 %mask)
   1002   store <16 x i16> %res1, <16 x i16> * %dst
   1003   %res  = add <16 x i16> %res2, %res3
   1004   ret <16 x i16> %res
   1005 }
   1006 
   1007 declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly
   1008 
   ; Packed compares producing k-register masks bitcast to integers. The
   ; predicate immediate is visible in the CHECKs (2 -> vcmpleps, 4 ->
   ; vcmpneqpd); the final i32 is the SAE operand (8 = {sae}, 4 = default).
   1009 define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
   1010 ; CHECK-LABEL: test_cmpps:
   1011 ; CHECK:       ## %bb.0:
   1012 ; CHECK-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
   1013 ; CHECK-NEXT:    kmovw %k0, %eax
   1014 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
   1015 ; CHECK-NEXT:    vzeroupper
   1016 ; CHECK-NEXT:    retq
   1017   %res = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8)
   1018   %1 = bitcast <16 x i1> %res to i16
   1019   ret i16 %1
   1020 }
   1021 declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
   1022 
   1023 define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
   1024 ; CHECK-LABEL: test_cmppd:
   1025 ; CHECK:       ## %bb.0:
   1026 ; CHECK-NEXT:    vcmpneqpd %zmm1, %zmm0, %k0
   1027 ; CHECK-NEXT:    kmovw %k0, %eax
   1028 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
   1029 ; CHECK-NEXT:    vzeroupper
   1030 ; CHECK-NEXT:    retq
   1031   %res = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i32 4)
   1032   %1 = bitcast <8 x i1> %res to i8
   1033   ret i8 %1
   1034 }
   1035 declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
   1036 
   1037 ; Function Attrs: nounwind readnone
   1038 
   1039  ; fp min - max
   ; Packed fp min/max with rounding operand 4 (current rounding), which
   ; lowers to the plain instructions with no embedded-rounding suffix.
   1040 define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
   1041 ; CHECK-LABEL: test_vmaxpd:
   1042 ; CHECK:       ## %bb.0:
   1043 ; CHECK-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
   1044 ; CHECK-NEXT:    retq
   1045   %1 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
   1046   ret <8 x double> %1
   1047 }
   1048 declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)
   1049 
   1050 define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
   1051 ; CHECK-LABEL: test_vminpd:
   1052 ; CHECK:       ## %bb.0:
   1053 ; CHECK-NEXT:    vminpd %zmm1, %zmm0, %zmm0
   1054 ; CHECK-NEXT:    retq
   1055   %1 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
   1056   ret <8 x double> %1
   1057 }
   1058 declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)
   1059 
   ; Masked scalar store: only bit 0 of %mask is kept (and i8 %mask, 1) and
   ; the 4 x i1 masked store lowers to a single k-masked vmovss.
   1060 define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) {
   1061 ; CHECK-LABEL: test_mask_store_ss:
   1062 ; CHECK:       ## %bb.0:
   1063 ; CHECK-NEXT:    kmovw %esi, %k1
   1064 ; CHECK-NEXT:    vmovss %xmm0, (%rdi) {%k1}
   1065 ; CHECK-NEXT:    retq
   1066   %1 = and i8 %mask, 1
   1067   %2 = bitcast i8* %ptr to <4 x float>*
   1068   %3 = bitcast i8 %1 to <8 x i1>
   1069   %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1070   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %data, <4 x float>* %2, i32 1, <4 x i1> %extract)
   1071   ret void
   1072 }
   1073 declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) #1
   1074 
   1075 
   ; Static-rounding packed arithmetic. The i32 rounding operand maps to the
   ; embedded-rounding suffix checked below: 0 = {rn-sae}, 1 = {rd-sae},
   ; 2 = {ru-sae}, 3 = {rz-sae}.
   1076 declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32)
   1077 declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32)
   1078 declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32)
   1079 
   1080 define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) {
   1081 ; CHECK-LABEL: test_vsubps_rn:
   1082 ; CHECK:       ## %bb.0:
   1083 ; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
   1084 ; CHECK-NEXT:    retq
   1085   %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0)
   1086   ret <16 x float> %1
   1087 }
   1088 
   1089 define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) {
   1090 ; CHECK-LABEL: test_vsubps_rd:
   1091 ; CHECK:       ## %bb.0:
   1092 ; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
   1093 ; CHECK-NEXT:    retq
   1094   %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1)
   1095   ret <16 x float> %1
   1096 }
   1097 
   1098 define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) {
   1099 ; CHECK-LABEL: test_vsubps_ru:
   1100 ; CHECK:       ## %bb.0:
   1101 ; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
   1102 ; CHECK-NEXT:    retq
   1103   %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2)
   1104   ret <16 x float> %1
   1105 }
   1106 
   1107 define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) {
   1108 ; CHECK-LABEL: test_vsubps_rz:
   1109 ; CHECK:       ## %bb.0:
   1110 ; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
   1111 ; CHECK-NEXT:    retq
   1112   %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3)
   1113   ret <16 x float> %1
   1114 }
   1115 
   1116 define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) {
   1117 ; CHECK-LABEL: test_vmulps_rn:
   1118 ; CHECK:       ## %bb.0:
   1119 ; CHECK-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm0
   1120 ; CHECK-NEXT:    retq
   1121   %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0)
   1122   ret <16 x float> %1
   1123 }
   1124 
   1125 define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) {
   1126 ; CHECK-LABEL: test_vmulps_rd:
   1127 ; CHECK:       ## %bb.0:
   1128 ; CHECK-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm0
   1129 ; CHECK-NEXT:    retq
   1130   %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1)
   1131   ret <16 x float> %1
   1132 }
   1133 
   1134 define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) {
   1135 ; CHECK-LABEL: test_vmulps_ru:
   1136 ; CHECK:       ## %bb.0:
   1137 ; CHECK-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm0
   1138 ; CHECK-NEXT:    retq
   1139   %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2)
   1140   ret <16 x float> %1
   1141 }
   1142 
   1143 define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) {
   1144 ; CHECK-LABEL: test_vmulps_rz:
   1145 ; CHECK:       ## %bb.0:
   1146 ; CHECK-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm0
   1147 ; CHECK-NEXT:    retq
   1148   %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3)
   1149   ret <16 x float> %1
   1150 }
   1151 
   1152 ;; mask float
   ;; mask float
   ; Zero-masking: select(bitcast i16 mask, result, zeroinitializer) folds
   ; into the {%k1} {z} form of the rounded multiply.
   1153 define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
   1154 ; CHECK-LABEL: test_vmulps_mask_rn:
   1155 ; CHECK:       ## %bb.0:
   1156 ; CHECK-NEXT:    kmovw %edi, %k1
   1157 ; CHECK-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
   1158 ; CHECK-NEXT:    retq
   1159   %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0)
   1160   %2 = bitcast i16 %mask to <16 x i1>
   1161   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
   1162   ret <16 x float> %3
   1163 }
   1164 
   1165 define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
   1166 ; CHECK-LABEL: test_vmulps_mask_rd:
   1167 ; CHECK:       ## %bb.0:
   1168 ; CHECK-NEXT:    kmovw %edi, %k1
   1169 ; CHECK-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
   1170 ; CHECK-NEXT:    retq
   1171   %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1)
   1172   %2 = bitcast i16 %mask to <16 x i1>
   1173   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
   1174   ret <16 x float> %3
   1175 }
   1176 
   1177 define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
   1178 ; CHECK-LABEL: test_vmulps_mask_ru:
   1179 ; CHECK:       ## %bb.0:
   1180 ; CHECK-NEXT:    kmovw %edi, %k1
   1181 ; CHECK-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
   1182 ; CHECK-NEXT:    retq
   1183   %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2)
   1184   %2 = bitcast i16 %mask to <16 x i1>
   1185   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
   1186   ret <16 x float> %3
   1187 }
   1188 
   1189 define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
   1190 ; CHECK-LABEL: test_vmulps_mask_rz:
   1191 ; CHECK:       ## %bb.0:
   1192 ; CHECK-NEXT:    kmovw %edi, %k1
   1193 ; CHECK-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
   1194 ; CHECK-NEXT:    retq
   1195   %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3)
   1196   %2 = bitcast i16 %mask to <16 x i1>
   1197   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
   1198   ret <16 x float> %3
   1199 }
   1200 
   1201 ;; With Passthru value
   ;; With Passthru value
   ; Merge-masking: select(mask, result, %passthru) folds into the {%k1}
   ; merge form writing into the passthru register (zmm2).
   1202 define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
   1203 ; CHECK-LABEL: test_vmulps_mask_passthru_rn:
   1204 ; CHECK:       ## %bb.0:
   1205 ; CHECK-NEXT:    kmovw %edi, %k1
   1206 ; CHECK-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   1207 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
   1208 ; CHECK-NEXT:    retq
   1209   %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0)
   1210   %2 = bitcast i16 %mask to <16 x i1>
   1211   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
   1212   ret <16 x float> %3
   1213 }
   1214 
   1215 define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
   1216 ; CHECK-LABEL: test_vmulps_mask_passthru_rd:
   1217 ; CHECK:       ## %bb.0:
   1218 ; CHECK-NEXT:    kmovw %edi, %k1
   1219 ; CHECK-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   1220 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
   1221 ; CHECK-NEXT:    retq
   1222   %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1)
   1223   %2 = bitcast i16 %mask to <16 x i1>
   1224   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
   1225   ret <16 x float> %3
   1226 }
   1227 
   1228 define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
   1229 ; CHECK-LABEL: test_vmulps_mask_passthru_ru:
   1230 ; CHECK:       ## %bb.0:
   1231 ; CHECK-NEXT:    kmovw %edi, %k1
   1232 ; CHECK-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   1233 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
   1234 ; CHECK-NEXT:    retq
   1235   %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2)
   1236   %2 = bitcast i16 %mask to <16 x i1>
   1237   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
   1238   ret <16 x float> %3
   1239 }
   1240 
   1241 define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
   1242 ; CHECK-LABEL: test_vmulps_mask_passthru_rz:
   1243 ; CHECK:       ## %bb.0:
   1244 ; CHECK-NEXT:    kmovw %edi, %k1
   1245 ; CHECK-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   1246 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
   1247 ; CHECK-NEXT:    retq
   1248   %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3)
   1249   %2 = bitcast i16 %mask to <16 x i1>
   1250   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
   1251   ret <16 x float> %3
   1252 }
   1253 
   1254 ;; mask double
   ;; mask double
   ; Packed-double counterparts of the zero-masked rounded multiplies; the
   ; i8 mask bitcasts to <8 x i1> and folds into {%k1} {z}.
   1255 define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
   1256 ; CHECK-LABEL: test_vmulpd_mask_rn:
   1257 ; CHECK:       ## %bb.0:
   1258 ; CHECK-NEXT:    kmovw %edi, %k1
   1259 ; CHECK-NEXT:    vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
   1260 ; CHECK-NEXT:    retq
   1261   %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0)
   1262   %2 = bitcast i8 %mask to <8 x i1>
   1263   %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
   1264   ret <8 x double> %3
   1265 }
   1266 
   1267 define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
   1268 ; CHECK-LABEL: test_vmulpd_mask_rd:
   1269 ; CHECK:       ## %bb.0:
   1270 ; CHECK-NEXT:    kmovw %edi, %k1
   1271 ; CHECK-NEXT:    vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
   1272 ; CHECK-NEXT:    retq
   1273   %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 1)
   1274   %2 = bitcast i8 %mask to <8 x i1>
   1275   %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
   1276   ret <8 x double> %3
   1277 }
   1278 
   1279 define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
   1280 ; CHECK-LABEL: test_vmulpd_mask_ru:
   1281 ; CHECK:       ## %bb.0:
   1282 ; CHECK-NEXT:    kmovw %edi, %k1
   1283 ; CHECK-NEXT:    vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
   1284 ; CHECK-NEXT:    retq
   1285   %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 2)
   1286   %2 = bitcast i8 %mask to <8 x i1>
   1287   %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
   1288   ret <8 x double> %3
   1289 }
   1290 
   1291 define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
   1292 ; CHECK-LABEL: test_vmulpd_mask_rz:
   1293 ; CHECK:       ## %bb.0:
   1294 ; CHECK-NEXT:    kmovw %edi, %k1
   1295 ; CHECK-NEXT:    vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
   1296 ; CHECK-NEXT:    retq
   1297   %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 3)
   1298   %2 = bitcast i8 %mask to <8 x i1>
   1299   %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
   1300   ret <8 x double> %3
   1301 }
   1302 
   ; Zero-masked rounded adds: one test per static rounding mode (0 = rn,
   ; 1 = rd, 2 = ru, 3 = rz), each folding the select into {%k1} {z}.
   1303 define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
   1304 ; CHECK-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
   1305 ; CHECK:       ## %bb.0:
   1306 ; CHECK-NEXT:    kmovw %edi, %k1
   1307 ; CHECK-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
   1308 ; CHECK-NEXT:    retq
   1309   %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0)
   1310   %2 = bitcast i16 %mask to <16 x i1>
   1311   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
   1312   ret <16 x float> %3
   1313 }
   1314 
   1315 define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
   1316 ; CHECK-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
   1317 ; CHECK:       ## %bb.0:
   1318 ; CHECK-NEXT:    kmovw %edi, %k1
   1319 ; CHECK-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
   1320 ; CHECK-NEXT:    retq
   1321   %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1)
   1322   %2 = bitcast i16 %mask to <16 x i1>
   1323   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
   1324   ret <16 x float> %3
   1325 }
   1326 
   1327 define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
   1328 ; CHECK-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
   1329 ; CHECK:       ## %bb.0:
   1330 ; CHECK-NEXT:    kmovw %edi, %k1
   1331 ; CHECK-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
   1332 ; CHECK-NEXT:    retq
   1333   %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2)
   1334   %2 = bitcast i16 %mask to <16 x i1>
   1335   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
   1336   ret <16 x float> %3
   1337 }
   1338 
   1339 define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
   1340 ; CHECK-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
   1341 ; CHECK:       ## %bb.0:
   1342 ; CHECK-NEXT:    kmovw %edi, %k1
   1343 ; CHECK-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
   1344 ; CHECK-NEXT:    retq
   1345   %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3)
   1346   %2 = bitcast i16 %mask to <16 x i1>
   1347   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
   1348   ret <16 x float> %3
   1349 }
   1350 
   1351 define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
   1352 ; CHECK-LABEL: test_mm512_maskz_add_round_ps_current:
   1353 ; CHECK:       ## %bb.0:
   1354 ; CHECK-NEXT:    kmovw %edi, %k1
   1355 ; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0 {%k1} {z}
   1356 ; CHECK-NEXT:    retq
   1357   %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
   1358   %2 = bitcast i16 %mask to <16 x i1>
   1359   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
   1360   ret <16 x float> %3
   1361 }
   1362 
; Merge-masked add.ps.512 tests: same rounding immediates as above, but the
; select falls back to %src, so codegen writes into %zmm2 under {%k1} and then
; moves the merged result back to %zmm0.

; imm 0 -> {rn-sae}.
define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rn_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; imm 1 -> {rd-sae}.
define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rd_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; imm 2 -> {ru-sae}.
define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_ru_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; imm 3 -> {rz-sae}.
define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rz_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; imm 4 -> current rounding mode, no embedded-rounding operand.
define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}
   1427 
; Unmasked add.ps.512 tests: the i16 %mask parameter is unused here; only the
; rounding immediate varies, selecting the embedded-rounding operand (or none
; for imm 4).

; imm 0 -> {rn-sae}.
define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rn_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0)
  ret <16 x float> %1
}

; imm 1 -> {rd-sae}.
define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rd_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1)
  ret <16 x float> %1
}

; imm 2 -> {ru-sae}.
define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_ru_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2)
  ret <16 x float> %1
}

; imm 3 -> {rz-sae}.
define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rz_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3)
  ret <16 x float> %1
}

; imm 4 -> current rounding mode.
define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %1
}
declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)
   1473 
; Merge-masked sub.ps.512 tests, mirroring the masked add tests above:
; select against %src folds into a {%k1} vsubps into %zmm2.

; imm 0 -> {rn-sae}.
define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_rn_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; imm 1 -> {rd-sae}.
define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_rd_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; imm 2 -> {ru-sae}.
define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_ru_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; imm 3 -> {rz-sae}.
define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_rz_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; imm 4 -> current rounding mode, no embedded-rounding operand.
define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}
   1538 
; Unmasked sub.ps.512 tests; %mask is unused, only the rounding immediate
; varies.

; imm 0 -> {rn-sae}.
define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rn_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0)
  ret <16 x float> %1
}

; imm 1 -> {rd-sae}.
define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rd_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1)
  ret <16 x float> %1
}

; imm 2 -> {ru-sae}.
define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_ru_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2)
  ret <16 x float> %1
}

; imm 3 -> {rz-sae}.
define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rz_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3)
  ret <16 x float> %1
}

; imm 4 -> current rounding mode.
define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vsubps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %1
}
   1583 
; Zeroing-masked div.ps.512 tests: select against zeroinitializer folds into
; a {%k1} {z} vdivps; rounding immediates as in the add tests.

; imm 0 -> {rn-sae}.
define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_rn_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

; imm 1 -> {rd-sae}.
define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_rd_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

; imm 2 -> {ru-sae}.
define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_ru_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

; imm 3 -> {rz-sae}.
define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_rz_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

; imm 4 -> current rounding mode.
define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}
   1643 
; Merge-masked div.ps.512 tests: select against %src folds into a {%k1}
; vdivps into %zmm2, then a move back to %zmm0.

; imm 0 -> {rn-sae}.
define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_rn_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; imm 1 -> {rd-sae}.
define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_rd_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; imm 2 -> {ru-sae}.
define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_ru_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; imm 3 -> {rz-sae}.
define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_rz_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; imm 4 -> current rounding mode.
define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}
   1708 
; Unmasked div.ps.512 tests; %mask is unused, only the rounding immediate
; varies.

; imm 0 -> {rn-sae}.
define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rn_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0)
  ret <16 x float> %1
}

; imm 1 -> {rd-sae}.
define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rd_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1)
  ret <16 x float> %1
}

; imm 2 -> {ru-sae}.
define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_ru_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2)
  ret <16 x float> %1
}

; imm 3 -> {rz-sae}.
define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rz_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3)
  ret <16 x float> %1
}

; imm 4 -> current rounding mode.
define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vdivps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %1
}
declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32)
   1754 
; min.ps.512 tests. min/max have no rounding behavior, so only two immediates
; are exercised: 8 -> {sae} (suppress-all-exceptions) and 4 -> default.
; Covered in zeroing-masked, merge-masked, and unmasked forms.

; Zeroing-masked, imm 8 -> {sae}.
define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_min_round_ps_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

; Zeroing-masked, imm 4 -> no {sae} operand.
define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_min_round_ps_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

; Merge-masked, imm 8 -> {sae}.
define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_min_round_ps_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; Merge-masked, imm 4 -> no {sae} operand.
define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_min_round_ps_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; Unmasked, imm 8 -> {sae}; %mask is unused.
define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_min_round_ps_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  ret <16 x float> %1
}

; Unmasked, imm 4; %mask is unused.
define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_min_round_ps_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %1
}
declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)
   1823 
; max.ps.512 tests, mirroring the min tests above: imm 8 -> {sae}, imm 4 ->
; default exceptions; zeroing-masked, merge-masked, and unmasked forms.

; Zeroing-masked, imm 8 -> {sae}.
define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_max_round_ps_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

; Zeroing-masked, imm 4 -> no {sae} operand.
define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_max_round_ps_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

; Merge-masked, imm 8 -> {sae}.
define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_max_round_ps_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; Merge-masked, imm 4 -> no {sae} operand.
define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_max_round_ps_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

; Unmasked, imm 8 -> {sae}; %mask is unused.
define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_max_round_ps_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  ret <16 x float> %1
}

; Unmasked, imm 4; %mask is unused.
define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_max_round_ps_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %1
}
declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)
   1892 
; Scalar vaddss via the mask.add.ss.round intrinsic, which carries its own
; passthru (%a2) and i8 mask operands instead of an IR-level select.
declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

; imm 0 -> {rn-sae}.
define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rn:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 0)
  ret <4 x float> %res
}

; imm 1 -> {rd-sae}.
define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
  ret <4 x float> %res
}

; imm 2 -> {ru-sae}.
define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_ru:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 2)
  ret <4 x float> %res
}

; imm 3 -> {rz-sae}.
define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rz:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 3)
  ret <4 x float> %res
}

; imm 4 -> current rounding mode, no embedded-rounding operand.
define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_current:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
  ret <4 x float> %res
}
   1949 
   1950 define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
   1951 ; CHECK-LABEL: test_maskz_add_ss_rn:
   1952 ; CHECK:       ## %bb.0:
   1953 ; CHECK-NEXT:    kmovw %edi, %k1
   1954 ; CHECK-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
   1955 ; CHECK-NEXT:    retq
   1956   %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 0)
   1957   ret <4 x float> %res
   1958 }
   1959 
   1960 define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
   1961 ; CHECK-LABEL: test_add_ss_rn:
   1962 ; CHECK:       ## %bb.0:
   1963 ; CHECK-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm0
   1964 ; CHECK-NEXT:    retq
   1965   %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 0)
   1966   ret <4 x float> %res
   1967 }
   1968 
   1969 define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
   1970 ; CHECK-LABEL: test_mask_add_ss_current_memfold:
   1971 ; CHECK:       ## %bb.0:
   1972 ; CHECK-NEXT:    kmovw %esi, %k1
   1973 ; CHECK-NEXT:    vaddss (%rdi), %xmm0, %xmm1 {%k1}
   1974 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
   1975 ; CHECK-NEXT:    retq
   1976   %a1.val = load float, float* %a1
   1977   %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
   1978   %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
   1979   %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
   1980   %a1v  = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
   1981   %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
   1982   ret <4 x float> %res
   1983 }
   1984 
   1985 define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
   1986 ; CHECK-LABEL: test_maskz_add_ss_current_memfold:
   1987 ; CHECK:       ## %bb.0:
   1988 ; CHECK-NEXT:    kmovw %esi, %k1
   1989 ; CHECK-NEXT:    vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
   1990 ; CHECK-NEXT:    retq
   1991   %a1.val = load float, float* %a1
   1992   %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
   1993   %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
   1994   %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
   1995   %a1v  = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
   1996   %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
   1997   ret <4 x float> %res
   1998 }
   1999 
   ; llvm.x86.avx512.mask.add.sd.round: scalar double add with merge-masking and
   ; an explicit rounding operand (0={rn-sae} 1={rd-sae} 2={ru-sae} 3={rz-sae}
   ; 4=current mode). Tests mirror the ss variants above.
   2000 declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
   2001 
   2002 define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
   2003 ; CHECK-LABEL: test_mask_add_sd_rn:
   2004 ; CHECK:       ## %bb.0:
   2005 ; CHECK-NEXT:    kmovw %edi, %k1
   2006 ; CHECK-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   2007 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
   2008 ; CHECK-NEXT:    retq
   2009   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 0)
   2010   ret <2 x double> %res
   2011 }
   2012 
   2013 define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
   2014 ; CHECK-LABEL: test_mask_add_sd_rd:
   2015 ; CHECK:       ## %bb.0:
   2016 ; CHECK-NEXT:    kmovw %edi, %k1
   2017 ; CHECK-NEXT:    vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   2018 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
   2019 ; CHECK-NEXT:    retq
   2020   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
   2021   ret <2 x double> %res
   2022 }
   2023 
   2024 define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
   2025 ; CHECK-LABEL: test_mask_add_sd_ru:
   2026 ; CHECK:       ## %bb.0:
   2027 ; CHECK-NEXT:    kmovw %edi, %k1
   2028 ; CHECK-NEXT:    vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   2029 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
   2030 ; CHECK-NEXT:    retq
   2031   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 2)
   2032   ret <2 x double> %res
   2033 }
   2034 
   2035 define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
   2036 ; CHECK-LABEL: test_mask_add_sd_rz:
   2037 ; CHECK:       ## %bb.0:
   2038 ; CHECK-NEXT:    kmovw %edi, %k1
   2039 ; CHECK-NEXT:    vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   2040 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
   2041 ; CHECK-NEXT:    retq
   2042   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 3)
   2043   ret <2 x double> %res
   2044 }
   2045 
   2046 define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
   2047 ; CHECK-LABEL: test_mask_add_sd_current:
   2048 ; CHECK:       ## %bb.0:
   2049 ; CHECK-NEXT:    kmovw %edi, %k1
   2050 ; CHECK-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
   2051 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
   2052 ; CHECK-NEXT:    retq
   2053   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
   2054   ret <2 x double> %res
   2055 }
   2056 
   ; Zero-masked variant: zeroinitializer passthru selects the {z} form.
   2057 define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
   2058 ; CHECK-LABEL: test_maskz_add_sd_rn:
   2059 ; CHECK:       ## %bb.0:
   2060 ; CHECK-NEXT:    kmovw %edi, %k1
   2061 ; CHECK-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
   2062 ; CHECK-NEXT:    retq
   2063   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 0)
   2064   ret <2 x double> %res
   2065 }
   2066 
   ; All-ones mask (-1): no mask register in the emitted instruction.
   2067 define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
   2068 ; CHECK-LABEL: test_add_sd_rn:
   2069 ; CHECK:       ## %bb.0:
   2070 ; CHECK-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0
   2071 ; CHECK-NEXT:    retq
   2072   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 0)
   2073   ret <2 x double> %res
   2074 }
   2075 
   ; *_memfold: operand 2 built as <loaded-val, 0> so the scalar load folds into
   ; vaddsd's memory operand.
   2076 define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
   2077 ; CHECK-LABEL: test_mask_add_sd_current_memfold:
   2078 ; CHECK:       ## %bb.0:
   2079 ; CHECK-NEXT:    kmovw %esi, %k1
   2080 ; CHECK-NEXT:    vaddsd (%rdi), %xmm0, %xmm1 {%k1}
   2081 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
   2082 ; CHECK-NEXT:    retq
   2083   %a1.val = load double, double* %a1
   2084   %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
   2085   %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
   2086   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
   2087   ret <2 x double> %res
   2088 }
   2089 
   2090 define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
   2091 ; CHECK-LABEL: test_maskz_add_sd_current_memfold:
   2092 ; CHECK:       ## %bb.0:
   2093 ; CHECK-NEXT:    kmovw %esi, %k1
   2094 ; CHECK-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 {%k1} {z}
   2095 ; CHECK-NEXT:    retq
   2096   %a1.val = load double, double* %a1
   2097   %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
   2098   %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
   2099   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
   2100   ret <2 x double> %res
   2101 }
   2102 
   ; llvm.x86.avx512.mask.max.ss.round: masked scalar float max. Rounding operand
   ; 8 = {sae} (suppress-all-exceptions, no rounding override); 4 = current mode.
   2103 declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
   2104 
   2105 define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
   2106 ; CHECK-LABEL: test_mask_max_ss_sae:
   2107 ; CHECK:       ## %bb.0:
   2108 ; CHECK-NEXT:    kmovw %edi, %k1
   2109 ; CHECK-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
   2110 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
   2111 ; CHECK-NEXT:    retq
   2112   %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
   2113   ret <4 x float> %res
   2114 }
   2115 
   2116 define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
   2117 ; CHECK-LABEL: test_maskz_max_ss_sae:
   2118 ; CHECK:       ## %bb.0:
   2119 ; CHECK-NEXT:    kmovw %edi, %k1
   2120 ; CHECK-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
   2121 ; CHECK-NEXT:    retq
   2122   %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
   2123   ret <4 x float> %res
   2124 }
   2125 
   2126 define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) {
   2127 ; CHECK-LABEL: test_max_ss_sae:
   2128 ; CHECK:       ## %bb.0:
   2129 ; CHECK-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm0
   2130 ; CHECK-NEXT:    retq
   2131   %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
   2132   ret <4 x float> %res
   2133 }
   2134 
   2135 define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
   2136 ; CHECK-LABEL: test_mask_max_ss:
   2137 ; CHECK:       ## %bb.0:
   2138 ; CHECK-NEXT:    kmovw %edi, %k1
   2139 ; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm2 {%k1}
   2140 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
   2141 ; CHECK-NEXT:    retq
   2142   %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
   2143   ret <4 x float> %res
   2144 }
   2145 
   2146 define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
   2147 ; CHECK-LABEL: test_maskz_max_ss:
   2148 ; CHECK:       ## %bb.0:
   2149 ; CHECK-NEXT:    kmovw %edi, %k1
   2150 ; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z}
   2151 ; CHECK-NEXT:    retq
   2152   %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4)
   2153   ret <4 x float> %res
   2154 }
   2155 
   2156 define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
   2157 ; CHECK-LABEL: test_max_ss:
   2158 ; CHECK:       ## %bb.0:
   2159 ; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
   2160 ; CHECK-NEXT:    retq
   2161   %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
   2162   ret <4 x float> %res
   2163 }
   2164 
   ; *_memfold: operand 2 built as <loaded-val, 0, 0, 0> so the scalar load folds
   ; into vmaxss's memory operand.
   2165 define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
   2166 ; CHECK-LABEL: test_mask_max_ss_memfold:
   2167 ; CHECK:       ## %bb.0:
   2168 ; CHECK-NEXT:    kmovw %esi, %k1
   2169 ; CHECK-NEXT:    vmaxss (%rdi), %xmm0, %xmm1 {%k1}
   2170 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
   2171 ; CHECK-NEXT:    retq
   2172   %a1.val = load float, float* %a1
   2173   %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
   2174   %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
   2175   %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
   2176   %a1v  = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
   2177   %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
   2178   ret <4 x float> %res
   2179 }
   2180 
   2181 define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
   2182 ; CHECK-LABEL: test_maskz_max_ss_memfold:
   2183 ; CHECK:       ## %bb.0:
   2184 ; CHECK-NEXT:    kmovw %esi, %k1
   2185 ; CHECK-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
   2186 ; CHECK-NEXT:    retq
   2187   %a1.val = load float, float* %a1
   2188   %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
   2189   %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
   2190   %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
   2191   %a1v  = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
   2192   %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
   2193   ret <4 x float> %res
   2194 }
   ; llvm.x86.avx512.mask.max.sd.round: masked scalar double max; same operand
   ; encoding as the ss variant above (8={sae}, 4=current mode).
   2195 declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
   2196 
   2197 define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
   2198 ; CHECK-LABEL: test_mask_max_sd_sae:
   2199 ; CHECK:       ## %bb.0:
   2200 ; CHECK-NEXT:    kmovw %edi, %k1
   2201 ; CHECK-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
   2202 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
   2203 ; CHECK-NEXT:    retq
   2204   %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
   2205   ret <2 x double> %res
   2206 }
   2207 
   2208 define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
   2209 ; CHECK-LABEL: test_maskz_max_sd_sae:
   2210 ; CHECK:       ## %bb.0:
   2211 ; CHECK-NEXT:    kmovw %edi, %k1
   2212 ; CHECK-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
   2213 ; CHECK-NEXT:    retq
   2214   %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
   2215   ret <2 x double> %res
   2216 }
   2217 
   2218 define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) {
   2219 ; CHECK-LABEL: test_max_sd_sae:
   2220 ; CHECK:       ## %bb.0:
   2221 ; CHECK-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm0
   2222 ; CHECK-NEXT:    retq
   2223   %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
   2224   ret <2 x double> %res
   2225 }
   2226 
   2227 define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
   2228 ; CHECK-LABEL: test_mask_max_sd:
   2229 ; CHECK:       ## %bb.0:
   2230 ; CHECK-NEXT:    kmovw %edi, %k1
   2231 ; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
   2232 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
   2233 ; CHECK-NEXT:    retq
   2234   %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
   2235   ret <2 x double> %res
   2236 }
   2237 
   2238 define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
   2239 ; CHECK-LABEL: test_maskz_max_sd:
   2240 ; CHECK:       ## %bb.0:
   2241 ; CHECK-NEXT:    kmovw %edi, %k1
   2242 ; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z}
   2243 ; CHECK-NEXT:    retq
   2244   %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4)
   2245   ret <2 x double> %res
   2246 }
   2247 
   2248 define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
   2249 ; CHECK-LABEL: test_max_sd:
   2250 ; CHECK:       ## %bb.0:
   2251 ; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
   2252 ; CHECK-NEXT:    retq
   2253   %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
   2254   ret <2 x double> %res
   2255 }
   2256 
   ; *_memfold: operand 2 built as <loaded-val, 0> so the scalar load folds into
   ; vmaxsd's memory operand.
   2257 define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
   2258 ; CHECK-LABEL: test_mask_max_sd_memfold:
   2259 ; CHECK:       ## %bb.0:
   2260 ; CHECK-NEXT:    kmovw %esi, %k1
   2261 ; CHECK-NEXT:    vmaxsd (%rdi), %xmm0, %xmm1 {%k1}
   2262 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
   2263 ; CHECK-NEXT:    retq
   2264   %a1.val = load double, double* %a1
   2265   %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
   2266   %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
   2267   %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
   2268   ret <2 x double> %res
   2269 }
   2270 
   2271 define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
   2272 ; CHECK-LABEL: test_maskz_max_sd_memfold:
   2273 ; CHECK:       ## %bb.0:
   2274 ; CHECK-NEXT:    kmovw %esi, %k1
   2275 ; CHECK-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 {%k1} {z}
   2276 ; CHECK-NEXT:    retq
   2277   %a1.val = load double, double* %a1
   2278   %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
   2279   %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
   2280   %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
   2281   ret <2 x double> %res
   2282 }
   2283 
   ; Signed int -> scalar FP conversions with embedded rounding; operand i32 3
   ; selects {rz-sae} (round toward zero) in the emitted vcvtsi2sd/ss.
   2284 define <2 x double> @test_x86_avx512_cvtsi2sd64(<2 x double> %a, i64 %b) {
   2285 ; CHECK-LABEL: test_x86_avx512_cvtsi2sd64:
   2286 ; CHECK:       ## %bb.0:
   2287 ; CHECK-NEXT:    vcvtsi2sdq %rdi, {rz-sae}, %xmm0, %xmm0
   2288 ; CHECK-NEXT:    retq
   2289   %res = call <2 x double> @llvm.x86.avx512.cvtsi2sd64(<2 x double> %a, i64 %b, i32 3) ; <<<2 x double>> [#uses=1]
   2290   ret <2 x double> %res
   2291 }
   2292 declare <2 x double> @llvm.x86.avx512.cvtsi2sd64(<2 x double>, i64, i32) nounwind readnone
   2293 
   2294 define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) {
   2295 ; CHECK-LABEL: test_x86_avx512_cvtsi2ss32:
   2296 ; CHECK:       ## %bb.0:
   2297 ; CHECK-NEXT:    vcvtsi2ssl %edi, {rz-sae}, %xmm0, %xmm0
   2298 ; CHECK-NEXT:    retq
   2299   %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 3) ; <<<4 x float>> [#uses=1]
   2300   ret <4 x float> %res
   2301 }
   2302 declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind readnone
   2303 
   2304 define <4 x float> @test_x86_avx512_cvtsi2ss64(<4 x float> %a, i64 %b) {
   2305 ; CHECK-LABEL: test_x86_avx512_cvtsi2ss64:
   2306 ; CHECK:       ## %bb.0:
   2307 ; CHECK-NEXT:    vcvtsi2ssq %rdi, {rz-sae}, %xmm0, %xmm0
   2308 ; CHECK-NEXT:    retq
   2309   %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss64(<4 x float> %a, i64 %b, i32 3) ; <<<4 x float>> [#uses=1]
   2310   ret <4 x float> %res
   2311 }
   2312 declare <4 x float> @llvm.x86.avx512.cvtsi2ss64(<4 x float>, i64, i32) nounwind readnone
   2313 
   ; Unsigned i32 -> scalar float conversions: i32 1 selects {rd-sae} (round
   ; down), i32 4 is current mode. The *_mem variants check load behavior: with
   ; static rounding the operand is loaded to a GPR first; without it the load
   ; folds directly into vcvtusi2ssl.
   2314 define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b)
   2315 ; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu32_ss:
   2316 ; CHECK:       ## %bb.0:
   2317 ; CHECK-NEXT:    vcvtusi2ssl %edi, {rd-sae}, %xmm0, %xmm0
   2318 ; CHECK-NEXT:    retq
   2319 {
   2320   %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 1) ; <<<4 x float>> [#uses=1]
   2321   ret <4 x float> %res
   2322 }
   2323 
   2324 define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, i32* %ptr)
   2325 ; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu32_ss_mem:
   2326 ; CHECK:       ## %bb.0:
   2327 ; CHECK-NEXT:    movl (%rdi), %eax
   2328 ; CHECK-NEXT:    vcvtusi2ssl %eax, {rd-sae}, %xmm0, %xmm0
   2329 ; CHECK-NEXT:    retq
   2330 {
   2331   %b = load i32, i32* %ptr
   2332   %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 1) ; <<<4 x float>> [#uses=1]
   2333   ret <4 x float> %res
   2334 }
   2335 
   2336 define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b)
   2337 ; CHECK-LABEL: test_x86_avx512__mm_cvtu32_ss:
   2338 ; CHECK:       ## %bb.0:
   2339 ; CHECK-NEXT:    vcvtusi2ssl %edi, %xmm0, %xmm0
   2340 ; CHECK-NEXT:    retq
   2341 {
   2342   %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
   2343   ret <4 x float> %res
   2344 }
   2345 
   2346 define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, i32* %ptr)
   2347 ; CHECK-LABEL: test_x86_avx512__mm_cvtu32_ss_mem:
   2348 ; CHECK:       ## %bb.0:
   2349 ; CHECK-NEXT:    vcvtusi2ssl (%rdi), %xmm0, %xmm0
   2350 ; CHECK-NEXT:    retq
   2351 {
   2352   %b = load i32, i32* %ptr
   2353   %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
   2354   ret <4 x float> %res
   2355 }
   2356 declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind readnone
   2357 
   ; Unsigned i64 -> scalar FP conversions: i32 1 = {rd-sae}, i32 4 = current.
   2358 define <4 x float> @_mm_cvt_roundu64_ss (<4 x float> %a, i64 %b)
   2359 ; CHECK-LABEL: _mm_cvt_roundu64_ss:
   2360 ; CHECK:       ## %bb.0:
   2361 ; CHECK-NEXT:    vcvtusi2ssq %rdi, {rd-sae}, %xmm0, %xmm0
   2362 ; CHECK-NEXT:    retq
   2363 {
   2364   %res = call <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float> %a, i64 %b, i32 1) ; <<<4 x float>> [#uses=1]
   2365   ret <4 x float> %res
   2366 }
   2367 
   2368 define <4 x float> @_mm_cvtu64_ss(<4 x float> %a, i64 %b)
   2369 ; CHECK-LABEL: _mm_cvtu64_ss:
   2370 ; CHECK:       ## %bb.0:
   2371 ; CHECK-NEXT:    vcvtusi2ssq %rdi, %xmm0, %xmm0
   2372 ; CHECK-NEXT:    retq
   2373 {
   2374   %res = call <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float> %a, i64 %b, i32 4) ; <<<4 x float>> [#uses=1]
   2375   ret <4 x float> %res
   2376 }
   2377 declare <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float>, i64, i32) nounwind readnone
   2378 
   ; NOTE(review): the next two test names look swapped -- the "_cvtu64_sd" test
   ; uses rounding operand 1 ({rd-sae}) while the "_cvt_roundu64_sd" test uses 4
   ; (current mode, no static rounding). Behavior tested is still valid; renaming
   ; would require regenerating the CHECK-LABELs, so only flagging it here.
   2379 define <2 x double> @test_x86_avx512_mm_cvtu64_sd(<2 x double> %a, i64 %b)
   2380 ; CHECK-LABEL: test_x86_avx512_mm_cvtu64_sd:
   2381 ; CHECK:       ## %bb.0:
   2382 ; CHECK-NEXT:    vcvtusi2sdq %rdi, {rd-sae}, %xmm0, %xmm0
   2383 ; CHECK-NEXT:    retq
   2384 {
   2385   %res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a, i64 %b, i32 1) ; <<<2 x double>> [#uses=1]
   2386   ret <2 x double> %res
   2387 }
   2388 
   2389 define <2 x double> @test_x86_avx512__mm_cvt_roundu64_sd(<2 x double> %a, i64 %b)
   2390 ; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu64_sd:
   2391 ; CHECK:       ## %bb.0:
   2392 ; CHECK-NEXT:    vcvtusi2sdq %rdi, %xmm0, %xmm0
   2393 ; CHECK-NEXT:    retq
   2394 {
   2395   %res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a, i64 %b, i32 4) ; <<<2 x double>> [#uses=1]
   2396   ret <2 x double> %res
   2397 }
   2398 declare <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double>, i64, i32) nounwind readnone
   2399 
   ; vpermi2var tests: the unmasked intrinsics are combined with an explicit
   ; select on the bitcast mask to model merge-masking (result falls back to the
   ; index operand %x1 where the mask bit is clear). Each test also performs an
   ; unmasked call and adds the two results so both codegen paths are checked.
   2400 declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
   2401 
   2402 define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
   2403 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
   2404 ; CHECK:       ## %bb.0:
   2405 ; CHECK-NEXT:    kmovw %esi, %k1
   2406 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm3
   2407 ; CHECK-NEXT:    vpermi2d (%rdi), %zmm0, %zmm3 {%k1}
   2408 ; CHECK-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0
   2409 ; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
   2410 ; CHECK-NEXT:    retq
   2411   %x2 = load <16 x i32>, <16 x i32>* %x2p
   2412   %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
   2413   %2 = bitcast i16 %x3 to <16 x i1>
   2414   %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
   2415   %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
   2416   %res2 = add <16 x i32> %3, %4
   2417   ret <16 x i32> %res2
   2418 }
   2419 
   2420 declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)
   2421 
   2422 define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
   2423 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
   2424 ; CHECK:       ## %bb.0:
   2425 ; CHECK-NEXT:    vmovapd %zmm0, %zmm3
   2426 ; CHECK-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm3
   2427 ; CHECK-NEXT:    kmovw %edi, %k1
   2428 ; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
   2429 ; CHECK-NEXT:    vaddpd %zmm3, %zmm1, %zmm0
   2430 ; CHECK-NEXT:    retq
   2431   %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
   2432   %2 = bitcast <8 x i64> %x1 to <8 x double>
   2433   %3 = bitcast i8 %x3 to <8 x i1>
   2434   %4 = select <8 x i1> %3, <8 x double> %1, <8 x double> %2
   2435   %5 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
   2436   %6 = bitcast <8 x i64> %x1 to <8 x double>
   2437   %res2 = fadd <8 x double> %4, %5
   2438   ret <8 x double> %res2
   2439 }
   2440 
   2441 declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)
   2442 
   2443 define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
   2444 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
   2445 ; CHECK:       ## %bb.0:
   2446 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   2447 ; CHECK-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm3
   2448 ; CHECK-NEXT:    kmovw %edi, %k1
   2449 ; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
   2450 ; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
   2451 ; CHECK-NEXT:    retq
   2452   %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
   2453   %2 = bitcast <16 x i32> %x1 to <16 x float>
   2454   %3 = bitcast i16 %x3 to <16 x i1>
   2455   %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
   2456   %5 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
   2457   %6 = bitcast <16 x i32> %x1 to <16 x float>
   2458   %res2 = fadd <16 x float> %4, %5
   2459   ret <16 x float> %res2
   2460 }
   2461 
   2462 declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
   2463 
   2464 define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
   2465 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
   2466 ; CHECK:       ## %bb.0:
   2467 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
   2468 ; CHECK-NEXT:    vpermt2q %zmm2, %zmm1, %zmm3
   2469 ; CHECK-NEXT:    kmovw %edi, %k1
   2470 ; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
   2471 ; CHECK-NEXT:    vpaddq %zmm3, %zmm1, %zmm0
   2472 ; CHECK-NEXT:    retq
   2473   %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
   2474   %2 = bitcast i8 %x3 to <8 x i1>
   2475   %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
   2476   %4 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
   2477   %res2 = add <8 x i64> %3, %4
   2478   ret <8 x i64> %res2
   2479 }
   2480 
   ; vpermt2var tests: same vpermi2var intrinsics, but with the data operand in
   ; position 0 and index operand in position 1, and the select's false value set
   ; to zeroinitializer (maskz, {z}) or %x1 (merge masking).
   2481 define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
   2482 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
   2483 ; CHECK:       ## %bb.0:
   2484 ; CHECK-NEXT:    kmovw %esi, %k1
   2485 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm2
   2486 ; CHECK-NEXT:    vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z}
   2487 ; CHECK-NEXT:    vpermt2d %zmm1, %zmm0, %zmm1
   2488 ; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm0
   2489 ; CHECK-NEXT:    retq
   2490   %x2 = load <16 x i32>, <16 x i32>* %x2p
   2491   %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
   2492   %2 = bitcast i16 %x3 to <16 x i1>
   2493   %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
   2494   %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x1)
   2495   %res2 = add <16 x i32> %3, %4
   2496   ret <16 x i32> %res2
   2497 }
   2498 
   ; Broadcast variant: %x2 is a splat built from a scalar load, so the memory
   ; operand folds as (%rdi){1to8}.
   2499 define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
   2500 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
   2501 ; CHECK:       ## %bb.0:
   2502 ; CHECK-NEXT:    kmovw %esi, %k1
   2503 ; CHECK-NEXT:    vmovapd %zmm1, %zmm2
   2504 ; CHECK-NEXT:    vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
   2505 ; CHECK-NEXT:    vpermt2pd %zmm1, %zmm0, %zmm1
   2506 ; CHECK-NEXT:    vaddpd %zmm1, %zmm2, %zmm0
   2507 ; CHECK-NEXT:    retq
   2508   %x2s = load double, double* %x2ptr
   2509   %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
   2510   %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
   2511   %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2)
   2512   %2 = bitcast i8 %x3 to <8 x i1>
   2513   %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
   2514   %4 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x1)
   2515   %res2 = fadd <8 x double> %3, %4
   2516   ret <8 x double> %res2
   2517 }
   2518 
   2519 define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
   2520 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
   2521 ; CHECK:       ## %bb.0:
   2522 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
   2523 ; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm3
   2524 ; CHECK-NEXT:    kmovw %edi, %k1
   2525 ; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z}
   2526 ; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
   2527 ; CHECK-NEXT:    retq
   2528   %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2)
   2529   %2 = bitcast i16 %x3 to <16 x i1>
   2530   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
   2531   %4 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2)
   2532   %res2 = fadd <16 x float> %3, %4
   2533   ret <16 x float> %res2
   2534 }
   2535 
   2536 define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
   2537 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
   2538 ; CHECK:       ## %bb.0:
   2539 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm3
   2540 ; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm3
   2541 ; CHECK-NEXT:    kmovw %edi, %k1
   2542 ; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z}
   2543 ; CHECK-NEXT:    vpaddq %zmm3, %zmm1, %zmm0
   2544 ; CHECK-NEXT:    retq
   2545   %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
   2546   %2 = bitcast i8 %x3 to <8 x i1>
   2547   %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
   2548   %4 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
   2549   %res2 = add <8 x i64> %3, %4
   2550   ret <8 x i64> %res2
   2551 }
   2552 
   2553 define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
   2554 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
   2555 ; CHECK:       ## %bb.0:
   2556 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm3
   2557 ; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
   2558 ; CHECK-NEXT:    kmovw %edi, %k1
   2559 ; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1 {%k1}
   2560 ; CHECK-NEXT:    vpaddd %zmm3, %zmm1, %zmm0
   2561 ; CHECK-NEXT:    retq
   2562   %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
   2563   %2 = bitcast i16 %x3 to <16 x i1>
   2564   %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
   2565   %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
   2566   %res2 = add <16 x i32> %3, %4
   2567   ret <16 x i32> %res2
   2568 }
   2569 
; ---- vscalefpd / vscalefps with embedded rounding (sae) control ----
; Last i32 argument selects the rounding mode; the masked call uses a
; non-default mode and the unmasked call uses {rn-sae}, so both forms of the
; rounding-control encoding are checked.
    2570 declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
    2571 define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
    2572 ; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
    2573 ; CHECK:       ## %bb.0:
    2574 ; CHECK-NEXT:    kmovw %edi, %k1
    2575 ; CHECK-NEXT:    vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
    2576 ; CHECK-NEXT:    vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
    2577 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
    2578 ; CHECK-NEXT:    retq
    2579   %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 3)
    2580   %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
    2581   %res2 = fadd <8 x double> %res, %res1
    2582   ret <8 x double> %res2
    2583 }
    2584 
; Single-precision counterpart: masked {ru-sae} (mode 2) + unmasked {rn-sae}.
    2585 declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
    2586 define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
    2587 ; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ps_512:
    2588 ; CHECK:       ## %bb.0:
    2589 ; CHECK-NEXT:    kmovw %edi, %k1
    2590 ; CHECK-NEXT:    vscalefps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
    2591 ; CHECK-NEXT:    vscalefps {rn-sae}, %zmm1, %zmm0, %zmm0
    2592 ; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
    2593 ; CHECK-NEXT:    retq
    2594   %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 2)
    2595   %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
    2596   %res2 = fadd <16 x float> %res, %res1
    2597   ret <16 x float> %res2
    2598 }
   2599 
; ---- vpmovqb / vpmovsqb / vpmovusqb: i64 -> i8 truncation (plain, signed-
; saturating, unsigned-saturating) ----
; Register tests sum the unmasked, merge-masked and zero-masked results so all
; three encodings appear; *_mem tests cover the direct-to-memory store form,
; unmasked then masked.
    2600 declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8)
    2601 
    2602 define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
    2603 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_512:
    2604 ; CHECK:       ## %bb.0:
    2605 ; CHECK-NEXT:    kmovw %edi, %k1
    2606 ; CHECK-NEXT:    vpmovqb %zmm0, %xmm2 {%k1} {z}
    2607 ; CHECK-NEXT:    vpmovqb %zmm0, %xmm1 {%k1}
    2608 ; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    2609 ; CHECK-NEXT:    vpmovqb %zmm0, %xmm0
    2610 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
    2611 ; CHECK-NEXT:    vzeroupper
    2612 ; CHECK-NEXT:    retq
    2613     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
    2614     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
    2615     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
    2616     %res3 = add <16 x i8> %res0, %res1
    2617     %res4 = add <16 x i8> %res3, %res2
    2618     ret <16 x i8> %res4
    2619 }
    2620 
    2621 declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64>, i8)
    2622 
; Store form: truncating stores straight to (%rdi), unmasked and masked.
    2623 define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
    2624 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
    2625 ; CHECK:       ## %bb.0:
    2626 ; CHECK-NEXT:    kmovw %esi, %k1
    2627 ; CHECK-NEXT:    vpmovqb %zmm0, (%rdi)
    2628 ; CHECK-NEXT:    vpmovqb %zmm0, (%rdi) {%k1}
    2629 ; CHECK-NEXT:    vzeroupper
    2630 ; CHECK-NEXT:    retq
    2631     call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    2632     call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    2633     ret void
    2634 }
    2635 
    2636 declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8)
    2637 
; Signed-saturating register form (vpmovsqb), same three-mask pattern.
    2638 define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
    2639 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_512:
    2640 ; CHECK:       ## %bb.0:
    2641 ; CHECK-NEXT:    kmovw %edi, %k1
    2642 ; CHECK-NEXT:    vpmovsqb %zmm0, %xmm2 {%k1} {z}
    2643 ; CHECK-NEXT:    vpmovsqb %zmm0, %xmm1 {%k1}
    2644 ; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    2645 ; CHECK-NEXT:    vpmovsqb %zmm0, %xmm0
    2646 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
    2647 ; CHECK-NEXT:    vzeroupper
    2648 ; CHECK-NEXT:    retq
    2649     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
    2650     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
    2651     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
    2652     %res3 = add <16 x i8> %res0, %res1
    2653     %res4 = add <16 x i8> %res3, %res2
    2654     ret <16 x i8> %res4
    2655 }
    2656 
    2657 declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64>, i8)
    2658 
; Signed-saturating store form.
    2659 define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
    2660 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512:
    2661 ; CHECK:       ## %bb.0:
    2662 ; CHECK-NEXT:    kmovw %esi, %k1
    2663 ; CHECK-NEXT:    vpmovsqb %zmm0, (%rdi)
    2664 ; CHECK-NEXT:    vpmovsqb %zmm0, (%rdi) {%k1}
    2665 ; CHECK-NEXT:    vzeroupper
    2666 ; CHECK-NEXT:    retq
    2667     call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    2668     call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    2669     ret void
    2670 }
    2671 
    2672 declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8)
    2673 
; Unsigned-saturating register form (vpmovusqb).
    2674 define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
    2675 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_512:
    2676 ; CHECK:       ## %bb.0:
    2677 ; CHECK-NEXT:    kmovw %edi, %k1
    2678 ; CHECK-NEXT:    vpmovusqb %zmm0, %xmm2 {%k1} {z}
    2679 ; CHECK-NEXT:    vpmovusqb %zmm0, %xmm1 {%k1}
    2680 ; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    2681 ; CHECK-NEXT:    vpmovusqb %zmm0, %xmm0
    2682 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
    2683 ; CHECK-NEXT:    vzeroupper
    2684 ; CHECK-NEXT:    retq
    2685     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
    2686     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
    2687     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
    2688     %res3 = add <16 x i8> %res0, %res1
    2689     %res4 = add <16 x i8> %res3, %res2
    2690     ret <16 x i8> %res4
    2691 }
    2692 
    2693 declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64>, i8)
    2694 
; Unsigned-saturating store form.
    2695 define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
    2696 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512:
    2697 ; CHECK:       ## %bb.0:
    2698 ; CHECK-NEXT:    kmovw %esi, %k1
    2699 ; CHECK-NEXT:    vpmovusqb %zmm0, (%rdi)
    2700 ; CHECK-NEXT:    vpmovusqb %zmm0, (%rdi) {%k1}
    2701 ; CHECK-NEXT:    vzeroupper
    2702 ; CHECK-NEXT:    retq
    2703     call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    2704     call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    2705     ret void
    2706 }
   2707 
; ---- vpmovqw / vpmovsqw / vpmovusqw: i64 -> i16 truncation ----
; Same structure as the qb family: register tests exercise unmasked,
; merge-masked and zero-masked forms; *_mem tests exercise the store form.
    2708 declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
    2709 
    2710 define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
    2711 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
    2712 ; CHECK:       ## %bb.0:
    2713 ; CHECK-NEXT:    kmovw %edi, %k1
    2714 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm2 {%k1} {z}
    2715 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm1 {%k1}
    2716 ; CHECK-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
    2717 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm0
    2718 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
    2719 ; CHECK-NEXT:    vzeroupper
    2720 ; CHECK-NEXT:    retq
    2721     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
    2722     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
    2723     %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
    2724     %res3 = add <8 x i16> %res0, %res1
    2725     %res4 = add <8 x i16> %res3, %res2
    2726     ret <8 x i16> %res4
    2727 }
    2728 
    2729 declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64>, i8)
    2730 
; Store form of the plain truncation.
    2731 define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
    2732 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
    2733 ; CHECK:       ## %bb.0:
    2734 ; CHECK-NEXT:    kmovw %esi, %k1
    2735 ; CHECK-NEXT:    vpmovqw %zmm0, (%rdi)
    2736 ; CHECK-NEXT:    vpmovqw %zmm0, (%rdi) {%k1}
    2737 ; CHECK-NEXT:    vzeroupper
    2738 ; CHECK-NEXT:    retq
    2739     call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    2740     call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    2741     ret void
    2742 }
    2743 
    2744 declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
    2745 
; Signed-saturating register form (vpmovsqw).
    2746 define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
    2747 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
    2748 ; CHECK:       ## %bb.0:
    2749 ; CHECK-NEXT:    kmovw %edi, %k1
    2750 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm2 {%k1} {z}
    2751 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm1 {%k1}
    2752 ; CHECK-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
    2753 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm0
    2754 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
    2755 ; CHECK-NEXT:    vzeroupper
    2756 ; CHECK-NEXT:    retq
    2757     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
    2758     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
    2759     %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
    2760     %res3 = add <8 x i16> %res0, %res1
    2761     %res4 = add <8 x i16> %res3, %res2
    2762     ret <8 x i16> %res4
    2763 }
    2764 
    2765 declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64>, i8)
    2766 
; Signed-saturating store form.
    2767 define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
    2768 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512:
    2769 ; CHECK:       ## %bb.0:
    2770 ; CHECK-NEXT:    kmovw %esi, %k1
    2771 ; CHECK-NEXT:    vpmovsqw %zmm0, (%rdi)
    2772 ; CHECK-NEXT:    vpmovsqw %zmm0, (%rdi) {%k1}
    2773 ; CHECK-NEXT:    vzeroupper
    2774 ; CHECK-NEXT:    retq
    2775     call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    2776     call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    2777     ret void
    2778 }
    2779 
    2780 declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
    2781 
; Unsigned-saturating register form (vpmovusqw).
    2782 define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
    2783 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
    2784 ; CHECK:       ## %bb.0:
    2785 ; CHECK-NEXT:    kmovw %edi, %k1
    2786 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm2 {%k1} {z}
    2787 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm1 {%k1}
    2788 ; CHECK-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
    2789 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm0
    2790 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
    2791 ; CHECK-NEXT:    vzeroupper
    2792 ; CHECK-NEXT:    retq
    2793     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
    2794     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
    2795     %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
    2796     %res3 = add <8 x i16> %res0, %res1
    2797     %res4 = add <8 x i16> %res3, %res2
    2798     ret <8 x i16> %res4
    2799 }
    2800 
    2801 declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64>, i8)
    2802 
; Unsigned-saturating store form.
    2803 define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
    2804 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512:
    2805 ; CHECK:       ## %bb.0:
    2806 ; CHECK-NEXT:    kmovw %esi, %k1
    2807 ; CHECK-NEXT:    vpmovusqw %zmm0, (%rdi)
    2808 ; CHECK-NEXT:    vpmovusqw %zmm0, (%rdi) {%k1}
    2809 ; CHECK-NEXT:    vzeroupper
    2810 ; CHECK-NEXT:    retq
    2811     call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    2812     call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    2813     ret void
    2814 }
   2815 
; ---- vpmovqd / vpmovsqd / vpmovusqd: i64 -> i32 truncation ----
; Results are 256-bit (ymm); note the register tests do not need vzeroupper
; since a ymm value is returned.
    2816 declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
    2817 
    2818 define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
    2819 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
    2820 ; CHECK:       ## %bb.0:
    2821 ; CHECK-NEXT:    kmovw %edi, %k1
    2822 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm2 {%k1} {z}
    2823 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm1 {%k1}
    2824 ; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
    2825 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm0
    2826 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    2827 ; CHECK-NEXT:    retq
    2828     %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
    2829     %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
    2830     %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
    2831     %res3 = add <8 x i32> %res0, %res1
    2832     %res4 = add <8 x i32> %res3, %res2
    2833     ret <8 x i32> %res4
    2834 }
    2835 
    2836 declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8)
    2837 
; Store form of the plain truncation.
    2838 define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
    2839 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
    2840 ; CHECK:       ## %bb.0:
    2841 ; CHECK-NEXT:    kmovw %esi, %k1
    2842 ; CHECK-NEXT:    vpmovqd %zmm0, (%rdi)
    2843 ; CHECK-NEXT:    vpmovqd %zmm0, (%rdi) {%k1}
    2844 ; CHECK-NEXT:    vzeroupper
    2845 ; CHECK-NEXT:    retq
    2846     call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    2847     call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    2848     ret void
    2849 }
    2850 
    2851 declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
    2852 
; Signed-saturating register form (vpmovsqd).
    2853 define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
    2854 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
    2855 ; CHECK:       ## %bb.0:
    2856 ; CHECK-NEXT:    kmovw %edi, %k1
    2857 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm2 {%k1} {z}
    2858 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm1 {%k1}
    2859 ; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
    2860 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm0
    2861 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    2862 ; CHECK-NEXT:    retq
    2863     %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
    2864     %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
    2865     %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
    2866     %res3 = add <8 x i32> %res0, %res1
    2867     %res4 = add <8 x i32> %res3, %res2
    2868     ret <8 x i32> %res4
    2869 }
    2870 
    2871 declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64>, i8)
    2872 
; Signed-saturating store form.
    2873 define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
    2874 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512:
    2875 ; CHECK:       ## %bb.0:
    2876 ; CHECK-NEXT:    kmovw %esi, %k1
    2877 ; CHECK-NEXT:    vpmovsqd %zmm0, (%rdi)
    2878 ; CHECK-NEXT:    vpmovsqd %zmm0, (%rdi) {%k1}
    2879 ; CHECK-NEXT:    vzeroupper
    2880 ; CHECK-NEXT:    retq
    2881     call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    2882     call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    2883     ret void
    2884 }
    2885 
    2886 declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
    2887 
; Unsigned-saturating register form (vpmovusqd).
    2888 define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
    2889 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
    2890 ; CHECK:       ## %bb.0:
    2891 ; CHECK-NEXT:    kmovw %edi, %k1
    2892 ; CHECK-NEXT:    vpmovusqd %zmm0, %ymm2 {%k1} {z}
    2893 ; CHECK-NEXT:    vpmovusqd %zmm0, %ymm1 {%k1}
    2894 ; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
    2895 ; CHECK-NEXT:    vpmovusqd %zmm0, %ymm0
    2896 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    2897 ; CHECK-NEXT:    retq
    2898     %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
    2899     %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
    2900     %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
    2901     %res3 = add <8 x i32> %res0, %res1
    2902     %res4 = add <8 x i32> %res3, %res2
    2903     ret <8 x i32> %res4
    2904 }
    2905 
    2906 declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64>, i8)
    2907 
; Unsigned-saturating store form.
    2908 define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
    2909 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512:
    2910 ; CHECK:       ## %bb.0:
    2911 ; CHECK-NEXT:    kmovw %esi, %k1
    2912 ; CHECK-NEXT:    vpmovusqd %zmm0, (%rdi)
    2913 ; CHECK-NEXT:    vpmovusqd %zmm0, (%rdi) {%k1}
    2914 ; CHECK-NEXT:    vzeroupper
    2915 ; CHECK-NEXT:    retq
    2916     call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    2917     call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    2918     ret void
    2919 }
   2920 
; ---- vpmovdb / vpmovsdb / vpmovusdb: i32 -> i8 truncation ----
; 16-lane variants, so these use an i16 mask instead of i8.
    2921 declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
    2922 
    2923 define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
    2924 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_512:
    2925 ; CHECK:       ## %bb.0:
    2926 ; CHECK-NEXT:    kmovw %edi, %k1
    2927 ; CHECK-NEXT:    vpmovdb %zmm0, %xmm2 {%k1} {z}
    2928 ; CHECK-NEXT:    vpmovdb %zmm0, %xmm1 {%k1}
    2929 ; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    2930 ; CHECK-NEXT:    vpmovdb %zmm0, %xmm0
    2931 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
    2932 ; CHECK-NEXT:    vzeroupper
    2933 ; CHECK-NEXT:    retq
    2934     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
    2935     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
    2936     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
    2937     %res3 = add <16 x i8> %res0, %res1
    2938     %res4 = add <16 x i8> %res3, %res2
    2939     ret <16 x i8> %res4
    2940 }
    2941 
    2942 declare void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32>, i16)
    2943 
; Store form of the plain truncation.
    2944 define void @test_int_x86_avx512_mask_pmov_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
    2945 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512:
    2946 ; CHECK:       ## %bb.0:
    2947 ; CHECK-NEXT:    kmovw %esi, %k1
    2948 ; CHECK-NEXT:    vpmovdb %zmm0, (%rdi)
    2949 ; CHECK-NEXT:    vpmovdb %zmm0, (%rdi) {%k1}
    2950 ; CHECK-NEXT:    vzeroupper
    2951 ; CHECK-NEXT:    retq
    2952     call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    2953     call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    2954     ret void
    2955 }
    2956 
    2957 declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
    2958 
; Signed-saturating register form (vpmovsdb).
    2959 define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
    2960 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
    2961 ; CHECK:       ## %bb.0:
    2962 ; CHECK-NEXT:    kmovw %edi, %k1
    2963 ; CHECK-NEXT:    vpmovsdb %zmm0, %xmm2 {%k1} {z}
    2964 ; CHECK-NEXT:    vpmovsdb %zmm0, %xmm1 {%k1}
    2965 ; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    2966 ; CHECK-NEXT:    vpmovsdb %zmm0, %xmm0
    2967 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
    2968 ; CHECK-NEXT:    vzeroupper
    2969 ; CHECK-NEXT:    retq
    2970     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
    2971     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
    2972     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
    2973     %res3 = add <16 x i8> %res0, %res1
    2974     %res4 = add <16 x i8> %res3, %res2
    2975     ret <16 x i8> %res4
    2976 }
    2977 
    2978 declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32>, i16)
    2979 
; Signed-saturating store form.
    2980 define void @test_int_x86_avx512_mask_pmovs_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
    2981 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512:
    2982 ; CHECK:       ## %bb.0:
    2983 ; CHECK-NEXT:    kmovw %esi, %k1
    2984 ; CHECK-NEXT:    vpmovsdb %zmm0, (%rdi)
    2985 ; CHECK-NEXT:    vpmovsdb %zmm0, (%rdi) {%k1}
    2986 ; CHECK-NEXT:    vzeroupper
    2987 ; CHECK-NEXT:    retq
    2988     call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    2989     call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    2990     ret void
    2991 }
    2992 
    2993 declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
    2994 
; Unsigned-saturating register form (vpmovusdb).
    2995 define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
    2996 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
    2997 ; CHECK:       ## %bb.0:
    2998 ; CHECK-NEXT:    kmovw %edi, %k1
    2999 ; CHECK-NEXT:    vpmovusdb %zmm0, %xmm2 {%k1} {z}
    3000 ; CHECK-NEXT:    vpmovusdb %zmm0, %xmm1 {%k1}
    3001 ; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    3002 ; CHECK-NEXT:    vpmovusdb %zmm0, %xmm0
    3003 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
    3004 ; CHECK-NEXT:    vzeroupper
    3005 ; CHECK-NEXT:    retq
    3006     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
    3007     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
    3008     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
    3009     %res3 = add <16 x i8> %res0, %res1
    3010     %res4 = add <16 x i8> %res3, %res2
    3011     ret <16 x i8> %res4
    3012 }
    3013 
    3014 declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32>, i16)
    3015 
; Unsigned-saturating store form.
    3016 define void @test_int_x86_avx512_mask_pmovus_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
    3017 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512:
    3018 ; CHECK:       ## %bb.0:
    3019 ; CHECK-NEXT:    kmovw %esi, %k1
    3020 ; CHECK-NEXT:    vpmovusdb %zmm0, (%rdi)
    3021 ; CHECK-NEXT:    vpmovusdb %zmm0, (%rdi) {%k1}
    3022 ; CHECK-NEXT:    vzeroupper
    3023 ; CHECK-NEXT:    retq
    3024     call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    3025     call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    3026     ret void
    3027 }
   3028 
; ---- vpmovdw / vpmovsdw / vpmovusdw: i32 -> i16 truncation ----
; 16-lane, ymm results; same unmasked/merge/zero coverage pattern as above.
    3029 declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
    3030 
    3031 define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
    3032 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
    3033 ; CHECK:       ## %bb.0:
    3034 ; CHECK-NEXT:    kmovw %edi, %k1
    3035 ; CHECK-NEXT:    vpmovdw %zmm0, %ymm2 {%k1} {z}
    3036 ; CHECK-NEXT:    vpmovdw %zmm0, %ymm1 {%k1}
    3037 ; CHECK-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
    3038 ; CHECK-NEXT:    vpmovdw %zmm0, %ymm0
    3039 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    3040 ; CHECK-NEXT:    retq
    3041     %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
    3042     %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
    3043     %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
    3044     %res3 = add <16 x i16> %res0, %res1
    3045     %res4 = add <16 x i16> %res3, %res2
    3046     ret <16 x i16> %res4
    3047 }
    3048 
    3049 declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32>, i16)
    3050 
; Store form of the plain truncation.
    3051 define void @test_int_x86_avx512_mask_pmov_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
    3052 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512:
    3053 ; CHECK:       ## %bb.0:
    3054 ; CHECK-NEXT:    kmovw %esi, %k1
    3055 ; CHECK-NEXT:    vpmovdw %zmm0, (%rdi)
    3056 ; CHECK-NEXT:    vpmovdw %zmm0, (%rdi) {%k1}
    3057 ; CHECK-NEXT:    vzeroupper
    3058 ; CHECK-NEXT:    retq
    3059     call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    3060     call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    3061     ret void
    3062 }
    3063 
    3064 declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
    3065 
; Signed-saturating register form (vpmovsdw).
    3066 define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
    3067 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
    3068 ; CHECK:       ## %bb.0:
    3069 ; CHECK-NEXT:    kmovw %edi, %k1
    3070 ; CHECK-NEXT:    vpmovsdw %zmm0, %ymm2 {%k1} {z}
    3071 ; CHECK-NEXT:    vpmovsdw %zmm0, %ymm1 {%k1}
    3072 ; CHECK-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
    3073 ; CHECK-NEXT:    vpmovsdw %zmm0, %ymm0
    3074 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    3075 ; CHECK-NEXT:    retq
    3076     %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
    3077     %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
    3078     %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
    3079     %res3 = add <16 x i16> %res0, %res1
    3080     %res4 = add <16 x i16> %res3, %res2
    3081     ret <16 x i16> %res4
    3082 }
    3083 
    3084 declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32>, i16)
    3085 
; Signed-saturating store form.
    3086 define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
    3087 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512:
    3088 ; CHECK:       ## %bb.0:
    3089 ; CHECK-NEXT:    kmovw %esi, %k1
    3090 ; CHECK-NEXT:    vpmovsdw %zmm0, (%rdi)
    3091 ; CHECK-NEXT:    vpmovsdw %zmm0, (%rdi) {%k1}
    3092 ; CHECK-NEXT:    vzeroupper
    3093 ; CHECK-NEXT:    retq
    3094     call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    3095     call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    3096     ret void
    3097 }
    3098 
    3099 declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
    3100 
; Unsigned-saturating register form (vpmovusdw).
    3101 define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
    3102 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
    3103 ; CHECK:       ## %bb.0:
    3104 ; CHECK-NEXT:    kmovw %edi, %k1
    3105 ; CHECK-NEXT:    vpmovusdw %zmm0, %ymm2 {%k1} {z}
    3106 ; CHECK-NEXT:    vpmovusdw %zmm0, %ymm1 {%k1}
    3107 ; CHECK-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
    3108 ; CHECK-NEXT:    vpmovusdw %zmm0, %ymm0
    3109 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    3110 ; CHECK-NEXT:    retq
    3111     %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
    3112     %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
    3113     %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
    3114     %res3 = add <16 x i16> %res0, %res1
    3115     %res4 = add <16 x i16> %res3, %res2
    3116     ret <16 x i16> %res4
    3117 }
    3118 
    3119 declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32>, i16)
    3120 
; Unsigned-saturating store form.
    3121 define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
    3122 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512:
    3123 ; CHECK:       ## %bb.0:
    3124 ; CHECK-NEXT:    kmovw %esi, %k1
    3125 ; CHECK-NEXT:    vpmovusdw %zmm0, (%rdi)
    3126 ; CHECK-NEXT:    vpmovusdw %zmm0, (%rdi) {%k1}
    3127 ; CHECK-NEXT:    vzeroupper
    3128 ; CHECK-NEXT:    retq
    3129     call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    3130     call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    3131     ret void
    3132 }
   3133 
; ---- Masked conversions with rounding control ----
; Rounding argument 4 means "current direction" (no embedded rounding); other
; values select an embedded {sae} rounding mode on the second, unmasked call.
    3134 declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32)
    3135 
; i32 -> float: masked default-rounding + unmasked {rn-sae}.
    3136 define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
    3137 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_512:
    3138 ; CHECK:       ## %bb.0:
    3139 ; CHECK-NEXT:    kmovw %edi, %k1
    3140 ; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm1 {%k1}
    3141 ; CHECK-NEXT:    vcvtdq2ps {rn-sae}, %zmm0, %zmm0
    3142 ; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
    3143 ; CHECK-NEXT:    retq
    3144   %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4)
    3145   %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 0)
    3146   %res2 = fadd <16 x float> %res, %res1
    3147   ret <16 x float> %res2
    3148 }
    3149 
    3150 declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
    3151 
; double -> i32: masked default-rounding + unmasked {rn-sae}.
    3152 define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
    3153 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
    3154 ; CHECK:       ## %bb.0:
    3155 ; CHECK-NEXT:    kmovw %edi, %k1
    3156 ; CHECK-NEXT:    vcvtpd2dq %zmm0, %ymm1 {%k1}
    3157 ; CHECK-NEXT:    vcvtpd2dq {rn-sae}, %zmm0, %ymm0
    3158 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
    3159 ; CHECK-NEXT:    retq
    3160   %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
    3161   %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 0)
    3162   %res2 = add <8 x i32> %res, %res1
    3163   ret <8 x i32> %res2
    3164 }
    3165 
    3166 declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
    3167 
; double -> float: masked default-rounding + unmasked {ru-sae} (mode 2).
    3168 define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) {
    3169 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
    3170 ; CHECK:       ## %bb.0:
    3171 ; CHECK-NEXT:    kmovw %edi, %k1
    3172 ; CHECK-NEXT:    vcvtpd2ps %zmm0, %ymm1 {%k1}
    3173 ; CHECK-NEXT:    vcvtpd2ps {ru-sae}, %zmm0, %ymm0
    3174 ; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
    3175 ; CHECK-NEXT:    retq
    3176   %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 %x2, i32 4)
    3177   %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 -1, i32 2)
    3178   %res2 = fadd <8 x float> %res, %res1
    3179   ret <8 x float> %res2
    3180 }
   3181 
   3182 declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
   3183 
; cvtpd2udq: both calls use embedded rounding -- masked {ru-sae} (i32 2)
; and unmasked {rn-sae} (i32 0).
   3184 define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
   3185 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
   3186 ; CHECK:       ## %bb.0:
   3187 ; CHECK-NEXT:    kmovw %edi, %k1
   3188 ; CHECK-NEXT:    vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
   3189 ; CHECK-NEXT:    vcvtpd2udq {rn-sae}, %zmm0, %ymm0
   3190 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
   3191 ; CHECK-NEXT:    retq
   3192   %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 2)
   3193   %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 0)
   3194   %res2 = add <8 x i32> %res, %res1
   3195   ret <8 x i32> %res2
   3196 }
   3197 
   3198 declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>, i16, i32)
   3199 
; cvtps2dq: masked {ru-sae} and unmasked {rn-sae} forms, summed.
   3200 define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
   3201 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_512:
   3202 ; CHECK:       ## %bb.0:
   3203 ; CHECK-NEXT:    kmovw %edi, %k1
   3204 ; CHECK-NEXT:    vcvtps2dq {ru-sae}, %zmm0, %zmm1 {%k1}
   3205 ; CHECK-NEXT:    vcvtps2dq {rn-sae}, %zmm0, %zmm0
   3206 ; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
   3207 ; CHECK-NEXT:    retq
   3208   %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 2)
   3209   %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 0)
   3210   %res2 = add <16 x i32> %res, %res1
   3211   ret <16 x i32> %res2
   3212 }
   3213 
   3214 declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double>, i8, i32)
   3215 
; cvtps2pd: widening conversion (ymm float -> zmm double); masked default
; rounding plus unmasked suppress-all-exceptions form (i32 8 -> {sae}).
   3216 define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) {
   3217 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
   3218 ; CHECK:       ## %bb.0:
   3219 ; CHECK-NEXT:    kmovw %edi, %k1
   3220 ; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm1 {%k1}
   3221 ; CHECK-NEXT:    vcvtps2pd {sae}, %ymm0, %zmm0
   3222 ; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
   3223 ; CHECK-NEXT:    retq
   3224   %res = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 %x2, i32 4)
   3225   %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 -1, i32 8)
   3226   %res2 = fadd <8 x double> %res, %res1
   3227   ret <8 x double> %res2
   3228 }
   3229 
   3230 declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)
   3231 
; cvtps2udq: masked {ru-sae} and unmasked {rn-sae} forms, summed.
   3232 define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
   3233 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_512:
   3234 ; CHECK:       ## %bb.0:
   3235 ; CHECK-NEXT:    kmovw %edi, %k1
   3236 ; CHECK-NEXT:    vcvtps2udq {ru-sae}, %zmm0, %zmm1 {%k1}
   3237 ; CHECK-NEXT:    vcvtps2udq {rn-sae}, %zmm0, %zmm0
   3238 ; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
   3239 ; CHECK-NEXT:    retq
   3240   %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 2)
   3241   %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 0)
   3242   %res2 = add <16 x i32> %res, %res1
   3243   ret <16 x i32> %res2
   3244 }
   3245 
   3246 declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
   3247 
; Truncating conversions (cvtt*) always round toward zero, so the only
; embedded-rounding variant exercised below is {sae} (i32 8); i32 4 is the
; plain form. cvttpd2dq: masked plain + unmasked {sae}.
   3248 define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
   3249 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
   3250 ; CHECK:       ## %bb.0:
   3251 ; CHECK-NEXT:    kmovw %edi, %k1
   3252 ; CHECK-NEXT:    vcvttpd2dq %zmm0, %ymm1 {%k1}
   3253 ; CHECK-NEXT:    vcvttpd2dq {sae}, %zmm0, %ymm0
   3254 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
   3255 ; CHECK-NEXT:    retq
   3256   %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
   3257   %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
   3258   %res2 = add <8 x i32> %res, %res1
   3259   ret <8 x i32> %res2
   3260 }
   3261 
   3262 declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32)
   3263 
; cvtudq2ps: masked default-rounding call and unmasked {rn-sae} call.
   3264 define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
   3265 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_512:
   3266 ; CHECK:       ## %bb.0:
   3267 ; CHECK-NEXT:    kmovw %edi, %k1
   3268 ; CHECK-NEXT:    vcvtudq2ps %zmm0, %zmm1 {%k1}
   3269 ; CHECK-NEXT:    vcvtudq2ps {rn-sae}, %zmm0, %zmm0
   3270 ; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
   3271 ; CHECK-NEXT:    retq
   3272   %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4)
   3273   %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 0)
   3274   %res2 = fadd <16 x float> %res, %res1
   3275   ret <16 x float> %res2
   3276 }
   3277 
   3278 declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
   3279 
; cvttpd2udq: masked plain form + unmasked {sae} form.
   3280 define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
   3281 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
   3282 ; CHECK:       ## %bb.0:
   3283 ; CHECK-NEXT:    kmovw %edi, %k1
   3284 ; CHECK-NEXT:    vcvttpd2udq %zmm0, %ymm1 {%k1}
   3285 ; CHECK-NEXT:    vcvttpd2udq {sae}, %zmm0, %ymm0
   3286 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
   3287 ; CHECK-NEXT:    retq
   3288   %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
   3289   %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
   3290   %res2 = add <8 x i32> %res, %res1
   3291   ret <8 x i32> %res2
   3292 }
   3293 
   3294 declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32)
   3295 
; cvttps2dq: masked plain form + unmasked {sae} form.
   3296 define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
   3297 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512:
   3298 ; CHECK:       ## %bb.0:
   3299 ; CHECK-NEXT:    kmovw %edi, %k1
   3300 ; CHECK-NEXT:    vcvttps2dq %zmm0, %zmm1 {%k1}
   3301 ; CHECK-NEXT:    vcvttps2dq {sae}, %zmm0, %zmm0
   3302 ; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
   3303 ; CHECK-NEXT:    retq
   3304   %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
   3305   %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
   3306   %res2 = add <16 x i32> %res, %res1
   3307   ret <16 x i32> %res2
   3308 }
   3309 
   3310 declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>, i16, i32)
   3311 
; cvttps2udq: masked plain form + unmasked {sae} form.
   3312 define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
   3313 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_512:
   3314 ; CHECK:       ## %bb.0:
   3315 ; CHECK-NEXT:    kmovw %edi, %k1
   3316 ; CHECK-NEXT:    vcvttps2udq %zmm0, %zmm1 {%k1}
   3317 ; CHECK-NEXT:    vcvttps2udq {sae}, %zmm0, %zmm0
   3318 ; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
   3319 ; CHECK-NEXT:    retq
   3320   %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
   3321   %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
   3322   %res2 = add <16 x i32> %res, %res1
   3323   ret <16 x i32> %res2
   3324 }
   3325 
   3326 declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
   3327 
; Scalar vgetexpss: covers all four masking/rounding combinations --
; merge-masked plain (i32 4), zero-masked {sae}, unmasked {sae}, and
; merge-masked {sae} -- with results combined so none are eliminated.
   3328 define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
   3329 ; CHECK-LABEL: test_getexp_ss:
   3330 ; CHECK:       ## %bb.0:
   3331 ; CHECK-NEXT:    kmovw %edi, %k1
   3332 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
   3333 ; CHECK-NEXT:    vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
   3334 ; CHECK-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
   3335 ; CHECK-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm5
   3336 ; CHECK-NEXT:    vaddps %xmm5, %xmm4, %xmm4
   3337 ; CHECK-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
   3338 ; CHECK-NEXT:    vaddps %xmm2, %xmm3, %xmm0
   3339 ; CHECK-NEXT:    vaddps %xmm4, %xmm0, %xmm0
   3340 ; CHECK-NEXT:    retq
   3341   %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
   3342   %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
   3343   %res2 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
   3344   %res3 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
   3345 
   3346   %res.1 = fadd <4 x float> %res0, %res1
   3347   %res.2 = fadd <4 x float> %res2, %res3
   3348   %res   = fadd <4 x float> %res.1, %res.2
   3349   ret <4 x float> %res
   3350 }
   3351 
   3352 declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
   3353 
; Scalar vgetexpsd: same shape as the ss test above, but the unmasked call
; (%res3) uses plain rounding (i32 4) rather than {sae}.
   3354 define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
   3355 ; CHECK-LABEL: test_getexp_sd:
   3356 ; CHECK:       ## %bb.0:
   3357 ; CHECK-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm3
   3358 ; CHECK-NEXT:    kmovw %edi, %k1
   3359 ; CHECK-NEXT:    vmovapd %xmm2, %xmm4
   3360 ; CHECK-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm4 {%k1}
   3361 ; CHECK-NEXT:    vgetexpsd {sae}, %xmm1, %xmm0, %xmm5 {%k1} {z}
   3362 ; CHECK-NEXT:    vaddpd %xmm3, %xmm5, %xmm3
   3363 ; CHECK-NEXT:    vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
   3364 ; CHECK-NEXT:    vaddpd %xmm2, %xmm4, %xmm0
   3365 ; CHECK-NEXT:    vaddpd %xmm3, %xmm0, %xmm0
   3366 ; CHECK-NEXT:    retq
   3367   %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
   3368   %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
   3369   %res2 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
   3370   %res3 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
   3371 
   3372   %res.1 = fadd <2 x double> %res0, %res1
   3373   %res.2 = fadd <2 x double> %res2, %res3
   3374   %res   = fadd <2 x double> %res.1, %res.2
   3375   ret <2 x double> %res
   3376 }
   3377 
   3378 declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
   3379 
; Scalar compare (sd): predicate immediate 5 is "not less than", with {sae}
; and a zero-extending mask %x3; the result mask bit comes back in al.
   3380 define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
   3381 ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd:
   3382 ; CHECK:       ## %bb.0:
   3383 ; CHECK-NEXT:    kmovw %edi, %k1
   3384 ; CHECK-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
   3385 ; CHECK-NEXT:    kmovw %k0, %eax
   3386 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
   3387 ; CHECK-NEXT:    retq
   3388 
   3389   %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
   3390   ret i8 %res4
   3391 }
   3392 
; Scalar compare (sd), all variants: predicates 2..5 (le, unord, neq, nlt)
; across unmasked/masked and plain/{sae} combinations, OR-reduced to one byte.
   3393 define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
   3394 ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
   3395 ; CHECK:       ## %bb.0:
   3396 ; CHECK-NEXT:    vcmplesd %xmm1, %xmm0, %k0
   3397 ; CHECK-NEXT:    kmovw %k0, %ecx
   3398 ; CHECK-NEXT:    vcmpunordsd {sae}, %xmm1, %xmm0, %k0
   3399 ; CHECK-NEXT:    kmovw %k0, %edx
   3400 ; CHECK-NEXT:    kmovw %edi, %k1
   3401 ; CHECK-NEXT:    vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
   3402 ; CHECK-NEXT:    kmovw %k0, %esi
   3403 ; CHECK-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
   3404 ; CHECK-NEXT:    kmovw %k0, %eax
   3405 ; CHECK-NEXT:    orb %sil, %al
   3406 ; CHECK-NEXT:    orb %dl, %al
   3407 ; CHECK-NEXT:    orb %cl, %al
   3408 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
   3409 ; CHECK-NEXT:    retq
   3410 
   3411   %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
   3412   %res2 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 3, i8 -1, i32 8)
   3413   %res3 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 4, i8 %x3, i32 4)
   3414   %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
   3415 
   3416   %res11 = or i8 %res1, %res2
   3417   %res12 = or i8 %res3, %res4
   3418   %res13 = or i8 %res11, %res12
   3419   ret i8 %res13
   3420 }
   3421 
   3422 declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
   3423 
; Scalar compare (ss): predicate 3 is "unordered", masked, plain rounding.
   3424 define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
   3425 ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss:
   3426 ; CHECK:       ## %bb.0:
   3427 ; CHECK-NEXT:    kmovw %edi, %k1
   3428 ; CHECK-NEXT:    vcmpunordss %xmm1, %xmm0, %k0 {%k1}
   3429 ; CHECK-NEXT:    kmovw %k0, %eax
   3430 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
   3431 ; CHECK-NEXT:    retq
   3432 
   3433   %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4)
   3434   ret i8 %res2
   3435 }
   3436 
   3437 
; Scalar compare (ss), all variants: mirrors the sd "_all" test but
; AND-reduces the four result bytes instead of OR-ing them.
   3438 define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
   3439 ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
   3440 ; CHECK:       ## %bb.0:
   3441 ; CHECK-NEXT:    vcmpless %xmm1, %xmm0, %k0
   3442 ; CHECK-NEXT:    kmovw %k0, %ecx
   3443 ; CHECK-NEXT:    vcmpunordss {sae}, %xmm1, %xmm0, %k0
   3444 ; CHECK-NEXT:    kmovw %k0, %edx
   3445 ; CHECK-NEXT:    kmovw %edi, %k1
   3446 ; CHECK-NEXT:    vcmpneqss %xmm1, %xmm0, %k0 {%k1}
   3447 ; CHECK-NEXT:    kmovw %k0, %esi
   3448 ; CHECK-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
   3449 ; CHECK-NEXT:    kmovw %k0, %eax
   3450 ; CHECK-NEXT:    andb %sil, %al
   3451 ; CHECK-NEXT:    andb %dl, %al
   3452 ; CHECK-NEXT:    andb %cl, %al
   3453 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
   3454 ; CHECK-NEXT:    retq
   3455   %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
   3456   %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
   3457   %res3 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 4, i8 %x3, i32 4)
   3458   %res4 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 5, i8 %x3, i32 8)
   3459 
   3460   %res11 = and i8 %res1, %res2
   3461   %res12 = and i8 %res3, %res4
   3462   %res13 = and i8 %res11, %res12
   3463   ret i8 %res13
   3464 }
   3465 
   3466 declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
   3467 
; vgetmantpd: immediate $11 selects the mantissa normalization/sign-control
; encoding; masked plain-rounding call plus unmasked {sae} call.
   3468 define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
   3469 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
   3470 ; CHECK:       ## %bb.0:
   3471 ; CHECK-NEXT:    kmovw %edi, %k1
   3472 ; CHECK-NEXT:    vgetmantpd $11, %zmm0, %zmm1 {%k1}
   3473 ; CHECK-NEXT:    vgetmantpd $11, {sae}, %zmm0, %zmm0
   3474 ; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
   3475 ; CHECK-NEXT:    retq
   3476   %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4)
   3477   %res1 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 -1, i32 8)
   3478   %res2 = fadd <8 x double> %res, %res1
   3479   ret <8 x double> %res2
   3480 }
   3481 
   3482 declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
   3483 
; vgetmantps: same pattern as the pd test, single-precision.
   3484 define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
   3485 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_512:
   3486 ; CHECK:       ## %bb.0:
   3487 ; CHECK-NEXT:    kmovw %edi, %k1
   3488 ; CHECK-NEXT:    vgetmantps $11, %zmm0, %zmm1 {%k1}
   3489 ; CHECK-NEXT:    vgetmantps $11, {sae}, %zmm0, %zmm0
   3490 ; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
   3491 ; CHECK-NEXT:    retq
   3492   %res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4)
   3493   %res1 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 8)
   3494   %res2 = fadd <16 x float> %res, %res1
   3495   ret <16 x float> %res2
   3496 }
   3497 
   3498 declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32)
   3499 
; Scalar vgetmantsd: merge-masked, zero-masked, masked {sae}, and unmasked
; plain-rounding variants, combined by fadd.
   3500 define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
   3501 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sd:
   3502 ; CHECK:       ## %bb.0:
   3503 ; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm3
   3504 ; CHECK-NEXT:    kmovw %edi, %k1
   3505 ; CHECK-NEXT:    vmovapd %xmm2, %xmm4
   3506 ; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1}
   3507 ; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm5 {%k1} {z}
   3508 ; CHECK-NEXT:    vaddpd %xmm5, %xmm4, %xmm4
   3509 ; CHECK-NEXT:    vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
   3510 ; CHECK-NEXT:    vaddpd %xmm3, %xmm2, %xmm0
   3511 ; CHECK-NEXT:    vaddpd %xmm0, %xmm4, %xmm0
   3512 ; CHECK-NEXT:    retq
   3513   %res  = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
   3514   %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4)
   3515   %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 8)
   3516   %res3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 -1, i32 4)
   3517   %res11 = fadd <2 x double> %res, %res1
   3518   %res12 = fadd <2 x double> %res2, %res3
   3519   %res13 = fadd <2 x double> %res11, %res12
   3520   ret <2 x double> %res13
   3521 }
   3522 
   3523 declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32)
   3524 
; Scalar vgetmantss: merge-masked, zero-masked, and two unmasked calls
; (with and without {sae}); note both unmasked calls use mask -1.
   3525 define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
   3526 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ss:
   3527 ; CHECK:       ## %bb.0:
   3528 ; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm3
   3529 ; CHECK-NEXT:    kmovw %edi, %k1
   3530 ; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
   3531 ; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
   3532 ; CHECK-NEXT:    vaddps %xmm4, %xmm2, %xmm2
   3533 ; CHECK-NEXT:    vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0
   3534 ; CHECK-NEXT:    vaddps %xmm3, %xmm0, %xmm0
   3535 ; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
   3536 ; CHECK-NEXT:    retq
   3537   %res  = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
   3538   %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4)
   3539   %res2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 8)
   3540   %res3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 4)
   3541   %res11 = fadd <4 x float> %res, %res1
   3542   %res12 = fadd <4 x float> %res2, %res3
   3543   %res13 = fadd <4 x float> %res11, %res12
   3544   ret <4 x float> %res13
   3545 }
   3546 
   3547 declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)
   3548 
; vpermilpd with a variable control vector, unmasked form.
   3549 define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1) {
   3550 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512:
   3551 ; CHECK:       ## %bb.0:
   3552 ; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm0
   3553 ; CHECK-NEXT:    retq
   3554   %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
   3555   ret <8 x double> %res
   3556 }
   3557 
; Masking is expressed in IR as an unmasked permute followed by a select on
; the bitcast mask; codegen must fold the select into a {%k1} merge-masked op.
   3558 define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %mask) {
   3559 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512_mask:
   3560 ; CHECK:       ## %bb.0:
   3561 ; CHECK-NEXT:    kmovw %edi, %k1
   3562 ; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
   3563 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
   3564 ; CHECK-NEXT:    retq
   3565   %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
   3566   %mask.cast = bitcast i8 %mask to <8 x i1>
   3567   %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> %x2
   3568   ret <8 x double> %res2
   3569 }
   3570 
; Same, but selecting against zero must fold to a {%k1} {z} zero-masked op.
   3571 define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0, <8 x i64> %x1, i8 %mask) {
   3572 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512_maskz:
   3573 ; CHECK:       ## %bb.0:
   3574 ; CHECK-NEXT:    kmovw %edi, %k1
   3575 ; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z}
   3576 ; CHECK-NEXT:    retq
   3577   %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
   3578   %mask.cast = bitcast i8 %mask to <8 x i1>
   3579   %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> zeroinitializer
   3580   ret <8 x double> %res2
   3581 }
   3582 
   3583 declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>)
   3584 
; vpermilps variable-control tests mirroring the pd variants above.
   3585 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1) {
   3586 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512:
   3587 ; CHECK:       ## %bb.0:
   3588 ; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm0
   3589 ; CHECK-NEXT:    retq
   3590   %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
   3591   ret <16 x float> %res
   3592 }
   3593 
; select-on-mask folded into a merge-masked vpermilps.
   3594 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) {
   3595 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_mask:
   3596 ; CHECK:       ## %bb.0:
   3597 ; CHECK-NEXT:    kmovw %edi, %k1
   3598 ; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm2 {%k1}
   3599 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
   3600 ; CHECK-NEXT:    retq
   3601   %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
   3602   %mask.cast = bitcast i16 %mask to <16 x i1>
   3603   %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
   3604   ret <16 x float> %res2
   3605 }
   3606 
; select-against-zero folded into a zero-masked vpermilps.
   3607 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) {
   3608 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_maskz:
   3609 ; CHECK:       ## %bb.0:
   3610 ; CHECK-NEXT:    kmovw %edi, %k1
   3611 ; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z}
   3612 ; CHECK-NEXT:    retq
   3613   %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
   3614   %mask.cast = bitcast i16 %mask to <16 x i1>
   3615   %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
   3616   ret <16 x float> %res2
   3617 }
   3618 
   3619 ; Test case to make sure we can print shuffle decode comments for constant pool loads.
   3620 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1) {
   3621 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool:
   3622 ; CHECK:       ## %bb.0:
   3623 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
   3624 ; CHECK-NEXT:    retq
   3625   %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
   3626   ret <16 x float> %res
   3627 }
   3628 
; Constant-pool control with merge masking: the decoded shuffle comment must
; still be printed on the {%k1} form.
   3629 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) {
   3630 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask:
   3631 ; CHECK:       ## %bb.0:
   3632 ; CHECK-NEXT:    kmovw %edi, %k1
   3633 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm2 {%k1} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
   3634 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
   3635 ; CHECK-NEXT:    retq
   3636   %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
   3637   %mask.cast = bitcast i16 %mask to <16 x i1>
   3638   %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
   3639   ret <16 x float> %res2
   3640 }
   3641 
; Constant-pool control with zero masking ({%k1} {z}).
   3642 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) {
   3643 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz:
   3644 ; CHECK:       ## %bb.0:
   3645 ; CHECK-NEXT:    kmovw %edi, %k1
   3646 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
   3647 ; CHECK-NEXT:    retq
   3648   %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
   3649   %mask.cast = bitcast i16 %mask to <16 x i1>
   3650   %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
   3651   ret <16 x float> %res2
   3652 }
   3653 
   3654 declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32)
   3655 
; Scalar vcvtss2sd: masked plain-rounding call and unmasked {sae} call.
   3656 define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
   3657 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
   3658 ; CHECK:       ## %bb.0:
   3659 ; CHECK-NEXT:    kmovw %edi, %k1
   3660 ; CHECK-NEXT:    vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1}
   3661 ; CHECK-NEXT:    vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
   3662 ; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
   3663 ; CHECK-NEXT:    retq
   3664   %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
   3665   %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
   3666   %res2 = fadd <2 x double> %res, %res1
   3667   ret <2 x double> %res2
   3668 }
   3669 
   3670 declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x double>, <4 x float>, i8, i32)
   3671 
; Scalar vcvtsd2ss: masked round-toward-zero (i32 3 -> {rz-sae}) call and
; unmasked {rn-sae} call (i32 8 lowers to {rn-sae} for this narrowing op).
   3672 define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) {
   3673 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
   3674 ; CHECK:       ## %bb.0:
   3675 ; CHECK-NEXT:    kmovw %edi, %k1
   3676 ; CHECK-NEXT:    vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   3677 ; CHECK-NEXT:    vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
   3678 ; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
   3679 ; CHECK-NEXT:    retq
   3680   %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 3)
   3681   %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
   3682   %res2 = fadd <4 x float> %res, %res1
   3683   ret <4 x float> %res2
   3684 }
   3685 
   3686 declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
   3687 
   3688 define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
   3689 ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
   3690 ; CHECK:       ## %bb.0:
   3691 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
   3692 ; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3
   3693 ; CHECK-NEXT:    kmovw %edi, %k1
   3694 ; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1}
   3695 ; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
   3696 ; CHECK-NEXT:    retq
   3697   %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
   3698   %2 = bitcast i16 %x4 to <16 x i1>
   3699   %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
   3700   %4 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
   3701   %res2 = add <16 x i32> %3, %4
   3702   ret <16 x i32> %res2
   3703 }
   3704 
; Zero-masked vpternlogd (imm8 = 33): selecting against zeroinitializer
; must lower to the {%k1} {z} form; the unmasked result is added in as well.
define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
  %2 = bitcast i16 %x4 to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
  %4 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
  %res2 = add <16 x i32> %3, %4
  ret <16 x i32> %res2
}
   3721 
   3722 declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
   3723 
; Merge-masked vpternlogq (imm8 = 33), qword element variant: i8 mask
; selects between the pternlog result and %x0; unmasked result added in.
define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
  %2 = bitcast i8 %x4 to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
  %4 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
  %res2 = add <8 x i64> %3, %4
  ret <8 x i64> %res2
}
   3740 
; Zero-masked vpternlogq (imm8 = 33): select against zeroinitializer must
; lower to the {%k1} {z} form; unmasked result added in as well.
define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
  %2 = bitcast i8 %x4 to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
  %4 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
  %res2 = add <8 x i64> %3, %4
  ret <8 x i64> %res2
}
   3757 
; vcomi.sd with predicate imm 0 and rounding arg 8: per the CHECK lines this
; selects the ordered-EQ compare with suppress-all-exceptions (vcmpeqsd {sae}).
define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vcmpeqsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8)
  ret i32 %res
}
   3767 
; vcomi.sd with predicate imm 8 and rounding arg 8: per the CHECK lines this
; selects the unordered-EQ compare with {sae} (vcmpeq_uqsd {sae}).
define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vcmpeq_uqsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8)
  ret i32 %res
}
   3777 
; Same EQ predicate (imm 0) with rounding arg 4 (current rounding): no {sae}
; modifier is expected in the emitted vcmpeqsd.
define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vcmpeqsd %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4)
  ret i32 %res
}
   3787 
; Unordered-EQ predicate (imm 8) with rounding arg 4: expect vcmpeq_uqsd
; without the {sae} modifier.
define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vcmpeq_uqsd %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4)
  ret i32 %res
}
   3797 
; Predicate imm 1 with {sae} (rounding arg 8): per the CHECK lines this maps
; to vcmpltsd {sae}.
define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vcmpltsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8)
  ret i32 %res
}
   3807 
; Predicate imm 9 with {sae} (rounding arg 8): per the CHECK lines this maps
; to vcmpngesd {sae}.
define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vcmpngesd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8)
  ret i32 %res
}
   3817 
; Predicate imm 1 with current rounding (arg 4): expect vcmpltsd without {sae}.
define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vcmpltsd %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4)
  ret i32 %res
}
   3827 
; Predicate imm 9 with current rounding (arg 4): expect vcmpngesd without {sae}.
define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vcmpngesd %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4)
  ret i32 %res
}
   3837 
   3838 declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
   3839 
; Single-precision counterpart of the vcomi.sd tests: predicate imm 9 with
; current rounding (arg 4) maps to vcmpngess.
define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vcmpngess %xmm1, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4)
  ret i32 %res
}
   3849 
   3850 declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
   3851 
   3852 declare <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32>, <16 x i32>)
   3853 
; Variable-count rotate-left (vprolvd): exercises merge-masked (select vs
; %x2), zero-masked (select vs zeroinitializer), and unmasked forms of the
; same intrinsic call, summing all three so none can be dropped.
define <16 x i32>@test_int_x86_avx512_mask_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vprolvd %zmm1, %zmm0, %zmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vprolvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1)
  %2 = bitcast i16 %x3 to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
  %4 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1)
  %5 = bitcast i16 %x3 to <16 x i1>
  %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
  %7 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1)
  %res3 = add <16 x i32> %3, %6
  %res4 = add <16 x i32> %res3, %7
  ret <16 x i32> %res4
}
   3875 
   3876 declare <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64>, <8 x i64>)
   3877 
; Variable-count rotate-left, qword elements (vprolvq): merge-masked,
; zero-masked, and unmasked forms, results summed.
define <8 x i64>@test_int_x86_avx512_mask_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vprolvq %zmm1, %zmm0, %zmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vprolvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1)
  %2 = bitcast i8 %x3 to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2
  %4 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1)
  %5 = bitcast i8 %x3 to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
  %7 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1)
  %res3 = add <8 x i64> %3, %6
  %res4 = add <8 x i64> %res3, %7
  ret <8 x i64> %res4
}
   3899 
   3900 declare <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32>, <16 x i32>)
   3901 
; Variable-count rotate-right, dword elements (vprorvd): merge-masked,
; zero-masked, and unmasked forms, results summed.
define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1)
  %2 = bitcast i16 %x3 to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
  %4 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1)
  %5 = bitcast i16 %x3 to <16 x i1>
  %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
  %7 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1)
  %res3 = add <16 x i32> %3, %6
  %res4 = add <16 x i32> %res3, %7
  ret <16 x i32> %res4
}
   3923 
   3924 declare <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64>, <8 x i64>)
   3925 
; Variable-count rotate-right, qword elements (vprorvq): merge-masked,
; zero-masked, and unmasked forms, results summed.
define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1)
  %2 = bitcast i8 %x3 to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2
  %4 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1)
  %5 = bitcast i8 %x3 to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
  %7 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1)
  %res3 = add <8 x i64> %3, %6
  %res4 = add <8 x i64> %res3, %7
  ret <8 x i64> %res4
}
   3947 
   3948 declare <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32>, i32)
   3949 
; Immediate-count rotate-left (vprold $3): merge-masked, zero-masked, and
; unmasked forms of the same call, results summed. %x1 is intentionally
; unused by the intrinsic calls (the count is the constant 3).
define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vprold $3, %zmm0, %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vprold $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vprold $3, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 3)
  %2 = bitcast i16 %x3 to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
  %4 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 3)
  %5 = bitcast i16 %x3 to <16 x i1>
  %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
  %7 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 3)
  %res3 = add <16 x i32> %3, %6
  %res4 = add <16 x i32> %res3, %7
  ret <16 x i32> %res4
}
   3971 
   3972 declare <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64>, i32)
   3973 
; Immediate-count rotate-left, qword elements (vprolq $3): merge-masked,
; zero-masked, and unmasked forms, results summed.
define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vprolq $3, %zmm0, %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vprolq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vprolq $3, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 3)
  %2 = bitcast i8 %x3 to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2
  %4 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 3)
  %5 = bitcast i8 %x3 to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
  %7 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 3)
  %res3 = add <8 x i64> %3, %6
  %res4 = add <8 x i64> %res3, %7
  ret <8 x i64> %res4
}
   3995 
   3996 declare <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32>, i32)
   3997 
; Immediate-count rotate-right, dword elements (vprord $3): merge-masked,
; zero-masked, and unmasked forms, results summed.
define <16 x i32>@test_int_x86_avx512_mask_pror_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vprord $3, %zmm0, %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vprord $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vprord $3, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 3)
  %2 = bitcast i16 %x3 to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
  %4 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 3)
  %5 = bitcast i16 %x3 to <16 x i1>
  %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
  %7 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 3)
  %res3 = add <16 x i32> %3, %6
  %res4 = add <16 x i32> %res3, %7
  ret <16 x i32> %res4
}
   4019 
   4020 declare <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64>, i32)
   4021 
; Immediate-count rotate-right, qword elements (vprorq $3): merge-masked,
; zero-masked, and unmasked forms, results summed.
define <8 x i64>@test_int_x86_avx512_mask_pror_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vprorq $3, %zmm0, %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vprorq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vprorq $3, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 3)
  %2 = bitcast i8 %x3 to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2
  %4 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 3)
  %5 = bitcast i8 %x3 to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
  %7 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 3)
  %res3 = add <8 x i64> %3, %6
  %res4 = add <8 x i64> %res3, %7
  ret <8 x i64> %res4
}
   4043 
   4044 declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
   4045 
; Variable permute of doubles (vpermpd, i64 indices): merge-masked,
; zero-masked, and unmasked forms of permvar.df.512, results combined with
; fadd so all three instructions must be emitted.
define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddpd %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
  %2 = bitcast i8 %x3 to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x2
  %4 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
  %5 = bitcast i8 %x3 to <8 x i1>
  %6 = select <8 x i1> %5, <8 x double> %4, <8 x double> zeroinitializer
  %7 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
  %res3 = fadd <8 x double> %3, %6
  %res4 = fadd <8 x double> %res3, %7
  ret <8 x double> %res4
}
   4067 
   4068 declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
   4069 
; Variable permute of qwords (vpermq): merge-masked, zero-masked, and
; unmasked forms of permvar.di.512, results summed.
define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
  %2 = bitcast i8 %x3 to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2
  %4 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
  %5 = bitcast i8 %x3 to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
  %7 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
  %res3 = add <8 x i64> %3, %6
  %res4 = add <8 x i64> %res3, %7
  ret <8 x i64> %res4
}
   4091 
   4092 declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
   4093 
; Variable permute of floats (vpermps, i32 indices): merge-masked,
; zero-masked, and unmasked forms of permvar.sf.512, combined with fadd.
define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddps %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
  %2 = bitcast i16 %x3 to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %x2
  %4 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
  %5 = bitcast i16 %x3 to <16 x i1>
  %6 = select <16 x i1> %5, <16 x float> %4, <16 x float> zeroinitializer
  %7 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
  %res3 = fadd <16 x float> %3, %6
  %res4 = fadd <16 x float> %res3, %7
  ret <16 x float> %res4
}
   4115 
   4116 declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
   4117 
; Variable permute of dwords (vpermd): merge-masked, zero-masked, and
; unmasked forms of permvar.si.512, results summed.
define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
  %2 = bitcast i16 %x3 to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
  %4 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
  %5 = bitcast i16 %x3 to <16 x i1>
  %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
  %7 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
  %res3 = add <16 x i32> %3, %6
  %res4 = add <16 x i32> %res3, %7
  ret <16 x i32> %res4
}
   4139 
   4140 declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
   4141 
; mask.fixupimm.pd.512: three calls — merge-masked ($4), masked with a
; zeroinitializer first operand ($5, which the backend forms as a zeroed
; register destination), and unmasked with rounding arg 8 ($3 {sae}) —
; results combined with fadd.
define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd %zmm0, %zmm3
; CHECK-NEXT:    vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT:    vaddpd %zmm4, %zmm3, %zmm3
; CHECK-NEXT:    vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 8)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}
   4161 
; Checks that the <8 x i64> table operand loaded from memory is folded into
; vfixupimmpd as a (%rdi) memory operand rather than loaded separately.
define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512_load(<8 x double> %x0, <8 x double> %x1, <8 x i64>* %x2ptr) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512_load:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vfixupimmpd $3, (%rdi), %zmm1, %zmm0
; CHECK-NEXT:    retq
  %x2 = load <8 x i64>, <8 x i64>* %x2ptr
  %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 4)
  ret <8 x double> %res
}
   4171 
   4172 declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
   4173 
; maskz.fixupimm.pd.512: zero-masked call ($3), zero-masked call with a
; zeroinitializer table operand ($5), and unmasked call with rounding arg 8
; ($2 {sae}); results combined with fadd.
define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd %zmm0, %zmm3
; CHECK-NEXT:    vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vmovapd %zmm0, %zmm5
; CHECK-NEXT:    vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
; CHECK-NEXT:    vaddpd %zmm5, %zmm3, %zmm3
; CHECK-NEXT:    vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
  %res2 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 2, i8 -1, i32 8)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}
   4194 
   4195 declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
   4196 
; Scalar mask.fixupimm.ss ($5 throughout): merge-masked call, merge-masked
; call with a zeroinitializer table operand, and unmasked call with rounding
; arg 8 ({sae}); results combined with fadd.
define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ss:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %xmm0, %xmm3
; CHECK-NEXT:    vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vmovaps %xmm0, %xmm5
; CHECK-NEXT:    vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
; CHECK-NEXT:    vaddps %xmm5, %xmm3, %xmm3
; CHECK-NEXT:    vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 8)
  %res3 = fadd <4 x float> %res, %res1
  %res4 = fadd <4 x float> %res3, %res2
  ret <4 x float> %res4
}
   4217 
   4218 declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
   4219 
; Scalar maskz.fixupimm.ss ($5 throughout): zero-masked call, zero-masked
; call with zeroinitializer table operand plus rounding arg 8 ({sae}), and
; an unmasked call (i8 -1 mask); results combined with fadd.
define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ss:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm3
; CHECK-NEXT:    vfixupimmss $5, %xmm2, %xmm1, %xmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %xmm0, %xmm4
; CHECK-NEXT:    vfixupimmss $5, %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT:    vaddps %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vaddps %xmm3, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8)
  %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 4)
  %res3 = fadd <4 x float> %res, %res1
  %res4 = fadd <4 x float> %res3, %res2
  ret <4 x float> %res4
}
   4240 
   4241 declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
   4242 
; Packed mask.fixupimm.ps.512 ($5 throughout): merge-masked call,
; merge-masked call with a zeroinitializer table operand, and unmasked call
; with rounding arg 8 ({sae}); results combined with fadd.
define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vmovaps %zmm0, %zmm5
; CHECK-NEXT:    vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
; CHECK-NEXT:    vaddps %zmm5, %zmm3, %zmm3
; CHECK-NEXT:    vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 8)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res3, %res2
  ret <16 x float> %res4
}
   4263 
; Checks that the <16 x i32> table operand loaded from memory is folded into
; vfixupimmps as a (%rdi) memory operand rather than loaded separately.
define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512_load(<16 x float> %x0, <16 x float> %x1, <16 x i32>* %x2ptr) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512_load:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vfixupimmps $5, (%rdi), %zmm1, %zmm0
; CHECK-NEXT:    retq
  %x2 = load <16 x i32>, <16 x i32>* %x2ptr
  %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4)
  ret <16 x float> %res
}
   4273 
   4274 declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
   4275 
; Packed maskz.fixupimm.ps.512 ($5 throughout): zero-masked call,
; zero-masked call with zeroinitializer table operand plus rounding arg 8
; ({sae}), and an unmasked call (i16 -1 mask); results combined with fadd.
define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm3
; CHECK-NEXT:    vmovaps %zmm0, %zmm4
; CHECK-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    vaddps %zmm0, %zmm4, %zmm0
; CHECK-NEXT:    vaddps %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 8)
  %res2 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res3, %res2
  ret <16 x float> %res4
}
   4296 
   4297 declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
   4298 
; Exercises the merge-masking scalar fixupimm.sd intrinsic: variable mask with
; current rounding, variable mask with a zero table under SAE, and an all-ones
; mask; the three results are summed to keep every call live.
define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_sd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovapd %xmm0, %xmm3
; CHECK-NEXT:    vfixupimmsd $5, %xmm2, %xmm1, %xmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd %xmm0, %xmm4
; CHECK-NEXT:    vfixupimmsd $5, %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vaddpd %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vaddpd %xmm3, %xmm0, %xmm0
; CHECK-NEXT:    retq
  ; variable mask, table %x2, default rounding environment (last arg 4)
  %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
  ; variable mask, zero table, suppress-all-exceptions (last arg 8)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
  ; all-ones mask — should select the unmasked instruction form
  %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 -1, i32 4)
  %res3 = fadd <2 x double> %res, %res1
  %res4 = fadd <2 x double> %res3, %res2
  ret <2 x double> %res4
}
   4319 
   4320 declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
   4321 
; Exercises the zero-masking scalar fixupimm.sd intrinsic: variable mask with
; current rounding, zero table under SAE, and table %x2 under SAE; all three
; results are summed so none of the calls can be dead-code eliminated.
define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_sd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd %xmm0, %xmm3
; CHECK-NEXT:    vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vmovapd %xmm0, %xmm5
; CHECK-NEXT:    vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
; CHECK-NEXT:    vaddpd %xmm5, %xmm3, %xmm3
; CHECK-NEXT:    vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  ; variable mask, table %x2, default rounding environment (last arg 4)
  %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
  ; variable mask, zero table, suppress-all-exceptions (last arg 8)
  %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
  ; variable mask, table %x2, suppress-all-exceptions (last arg 8)
  %res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 8)
  %res3 = fadd <2 x double> %res, %res1
  %res4 = fadd <2 x double> %res3, %res2
  ret <2 x double> %res4
}
   4342 
   4343 declare double @llvm.fma.f64(double, double, double) #1
   4344 declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #0
   4345 
; Scalar-FMA (sd) merge-masking pattern test. Builds four element-0 FMA
; results on expanded IR (extract/fma/insert, as emitted by the auto-upgrader
; for the old mask.vfmadd intrinsics): unmasked and masked via llvm.fma.f64
; (default rounding), then unmasked and masked via the rounding intrinsic
; ({rz-sae}, rounding arg 3). All four are summed to keep them live.
define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovapd %xmm0, %xmm3
; CHECK-NEXT:    vfmadd213sd {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd %xmm0, %xmm4
; CHECK-NEXT:    vfmadd213sd {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
; CHECK-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vmovapd %xmm0, %xmm4
; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4
; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vaddpd %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  ; unmasked fma on element 0, default rounding
  %1 = extractelement <2 x double> %x0, i64 0
  %2 = extractelement <2 x double> %x1, i64 0
  %3 = extractelement <2 x double> %x2, i64 0
  %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
  %5 = insertelement <2 x double> %x0, double %4, i64 0
  ; merge-masked fma on element 0 (mask bit 0 selects result or passthrough %6)
  %6 = extractelement <2 x double> %x0, i64 0
  %7 = extractelement <2 x double> %x1, i64 0
  %8 = extractelement <2 x double> %x2, i64 0
  %9 = call double @llvm.fma.f64(double %6, double %7, double %8)
  %10 = bitcast i8 %x3 to <8 x i1>
  %11 = extractelement <8 x i1> %10, i64 0
  %12 = select i1 %11, double %9, double %6
  %13 = insertelement <2 x double> %x0, double %12, i64 0
  ; unmasked fma with explicit rounding (i32 3 = round-toward-zero, SAE)
  %14 = extractelement <2 x double> %x0, i64 0
  %15 = extractelement <2 x double> %x1, i64 0
  %16 = extractelement <2 x double> %x2, i64 0
  %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 3)
  %18 = insertelement <2 x double> %x0, double %17, i64 0
  ; merge-masked fma with explicit rounding
  %19 = extractelement <2 x double> %x0, i64 0
  %20 = extractelement <2 x double> %x1, i64 0
  %21 = extractelement <2 x double> %x2, i64 0
  %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 3)
  %23 = bitcast i8 %x3 to <8 x i1>
  %24 = extractelement <8 x i1> %23, i64 0
  %25 = select i1 %24, double %22, double %19
  %26 = insertelement <2 x double> %x0, double %25, i64 0
  %res4 = fadd <2 x double> %5, %13
  %res5 = fadd <2 x double> %18, %26
  %res6 = fadd <2 x double> %res4, %res5
  ret <2 x double> %res6
}
   4392 
; Scalar-FMA (ss) merge-masking pattern test — float counterpart of the sd
; test above: unmasked and masked llvm.fma.f32 (default rounding), then
; unmasked and masked rounding-intrinsic variants ({rz-sae}, rounding arg 3),
; all summed to stay live.
define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm3
; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %xmm0, %xmm4
; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
; CHECK-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vmovaps %xmm0, %xmm4
; CHECK-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4
; CHECK-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vaddps %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  ; unmasked fma on element 0, default rounding
  %1 = extractelement <4 x float> %x0, i64 0
  %2 = extractelement <4 x float> %x1, i64 0
  %3 = extractelement <4 x float> %x2, i64 0
  %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
  %5 = insertelement <4 x float> %x0, float %4, i64 0
  ; merge-masked fma on element 0 (mask bit 0 selects result or passthrough %6)
  %6 = extractelement <4 x float> %x0, i64 0
  %7 = extractelement <4 x float> %x1, i64 0
  %8 = extractelement <4 x float> %x2, i64 0
  %9 = call float @llvm.fma.f32(float %6, float %7, float %8)
  %10 = bitcast i8 %x3 to <8 x i1>
  %11 = extractelement <8 x i1> %10, i64 0
  %12 = select i1 %11, float %9, float %6
  %13 = insertelement <4 x float> %x0, float %12, i64 0
  ; unmasked fma with explicit rounding (i32 3 = round-toward-zero, SAE)
  %14 = extractelement <4 x float> %x0, i64 0
  %15 = extractelement <4 x float> %x1, i64 0
  %16 = extractelement <4 x float> %x2, i64 0
  %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 3)
  %18 = insertelement <4 x float> %x0, float %17, i64 0
  ; merge-masked fma with explicit rounding
  %19 = extractelement <4 x float> %x0, i64 0
  %20 = extractelement <4 x float> %x1, i64 0
  %21 = extractelement <4 x float> %x2, i64 0
  %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 3)
  %23 = bitcast i8 %x3 to <8 x i1>
  %24 = extractelement <8 x i1> %23, i64 0
  %25 = select i1 %24, float %22, float %19
  %26 = insertelement <4 x float> %x0, float %25, i64 0
  %res4 = fadd <4 x float> %5, %13
  %res5 = fadd <4 x float> %18, %26
  %res6 = fadd <4 x float> %res4, %res5
  ret <4 x float> %res6
}
   4439 
; Scalar-FMA (sd) zero-masking pattern test: the select falls back to 0.0
; instead of the passthrough element. One llvm.fma.f64 variant (default
; rounding) plus one rounding-intrinsic variant ({rz-sae}); summed to stay
; live.
define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd %xmm0, %xmm3
; CHECK-NEXT:    vfmadd213sd {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  ; zero-masked fma on element 0, default rounding
  %1 = extractelement <2 x double> %x0, i64 0
  %2 = extractelement <2 x double> %x1, i64 0
  %3 = extractelement <2 x double> %x2, i64 0
  %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
  %5 = bitcast i8 %x3 to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double 0.000000e+00
  %8 = insertelement <2 x double> %x0, double %7, i64 0
  ; zero-masked fma with explicit rounding (i32 3 = round-toward-zero, SAE)
  %9 = extractelement <2 x double> %x0, i64 0
  %10 = extractelement <2 x double> %x1, i64 0
  %11 = extractelement <2 x double> %x2, i64 0
  %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 3)
  %13 = bitcast i8 %x3 to <8 x i1>
  %14 = extractelement <8 x i1> %13, i64 0
  %15 = select i1 %14, double %12, double 0.000000e+00
  %16 = insertelement <2 x double> %x0, double %15, i64 0
  %res2 = fadd <2 x double> %8, %16
  ret <2 x double> %res2
}
   4468 
   4469 declare float @llvm.fma.f32(float, float, float) #1
   4470 declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #0
   4471 
; Scalar-FMA (ss) zero-masking pattern test — float counterpart of the sd
; test above.
; NOTE(review): this function returns %8, not %res2, so the rounding-intrinsic
; half (%9-%16) and %res2 are dead and the asm assertions show only one
; vfmadd213ss. Every sibling test returns the summed result; `ret %res2` was
; presumably intended — confirm against upstream and regenerate the asm
; assertions with update_llc_test_checks.py if changed.
define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT:    retq
  ; zero-masked fma on element 0, default rounding
  %1 = extractelement <4 x float> %x0, i64 0
  %2 = extractelement <4 x float> %x1, i64 0
  %3 = extractelement <4 x float> %x2, i64 0
  %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
  %5 = bitcast i8 %x3 to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, float %4, float 0.000000e+00
  %8 = insertelement <4 x float> %x0, float %7, i64 0
  ; zero-masked fma with explicit rounding — dead code, see NOTE above
  %9 = extractelement <4 x float> %x0, i64 0
  %10 = extractelement <4 x float> %x1, i64 0
  %11 = extractelement <4 x float> %x2, i64 0
  %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 3)
  %13 = bitcast i8 %x3 to <8 x i1>
  %14 = extractelement <8 x i1> %13, i64 0
  %15 = select i1 %14, float %12, float 0.000000e+00
  %16 = insertelement <4 x float> %x0, float %15, i64 0
  %res2 = fadd <4 x float> %8, %16
  ret <4 x float> %8
}
   4497 
; Scalar-FMA (sd) mask3 pattern test: the result/passthrough vector is the
; addend %x2 (expects the 231 instruction form). Unmasked and masked
; llvm.fma.f64 variants plus unmasked and masked rounding-intrinsic variants
; ({rz-sae}, rounding arg 3), all summed to stay live.
define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovapd %xmm2, %xmm3
; CHECK-NEXT:    vfmadd231sd {{.*#+}} xmm3 = (xmm0 * xmm1) + xmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd %xmm2, %xmm4
; CHECK-NEXT:    vfmadd231sd {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
; CHECK-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vmovapd %xmm2, %xmm4
; CHECK-NEXT:    vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4
; CHECK-NEXT:    vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm4, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  ; unmasked fma, result inserted into %x2 (mask3 form)
  %1 = extractelement <2 x double> %x0, i64 0
  %2 = extractelement <2 x double> %x1, i64 0
  %3 = extractelement <2 x double> %x2, i64 0
  %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
  %5 = insertelement <2 x double> %x2, double %4, i64 0
  ; merge-masked fma, passthrough is %x2 element 0
  %6 = extractelement <2 x double> %x0, i64 0
  %7 = extractelement <2 x double> %x1, i64 0
  %8 = extractelement <2 x double> %x2, i64 0
  %9 = call double @llvm.fma.f64(double %6, double %7, double %8)
  %10 = bitcast i8 %x3 to <8 x i1>
  %11 = extractelement <8 x i1> %10, i64 0
  %12 = select i1 %11, double %9, double %8
  %13 = insertelement <2 x double> %x2, double %12, i64 0
  ; unmasked fma with explicit rounding (i32 3 = round-toward-zero, SAE)
  %14 = extractelement <2 x double> %x0, i64 0
  %15 = extractelement <2 x double> %x1, i64 0
  %16 = extractelement <2 x double> %x2, i64 0
  %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 3)
  %18 = insertelement <2 x double> %x2, double %17, i64 0
  ; merge-masked fma with explicit rounding
  %19 = extractelement <2 x double> %x0, i64 0
  %20 = extractelement <2 x double> %x1, i64 0
  %21 = extractelement <2 x double> %x2, i64 0
  %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 3)
  %23 = bitcast i8 %x3 to <8 x i1>
  %24 = extractelement <8 x i1> %23, i64 0
  %25 = select i1 %24, double %22, double %21
  %26 = insertelement <2 x double> %x2, double %25, i64 0
  %res4 = fadd <2 x double> %5, %13
  %res5 = fadd <2 x double> %18, %26
  %res6 = fadd <2 x double> %res4, %res5
  ret <2 x double> %res6
}
   4544 
; Scalar-FMA (ss) mask3 pattern test — float counterpart of the sd test
; above: result/passthrough is the addend %x2 (expects the 231 instruction
; form); four variants summed to stay live.
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, %xmm3
; CHECK-NEXT:    vfmadd231ss {{.*#+}} xmm3 = (xmm0 * xmm1) + xmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %xmm2, %xmm4
; CHECK-NEXT:    vfmadd231ss {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
; CHECK-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vmovaps %xmm2, %xmm4
; CHECK-NEXT:    vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4
; CHECK-NEXT:    vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm4, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  ; unmasked fma, result inserted into %x2 (mask3 form)
  %1 = extractelement <4 x float> %x0, i64 0
  %2 = extractelement <4 x float> %x1, i64 0
  %3 = extractelement <4 x float> %x2, i64 0
  %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
  %5 = insertelement <4 x float> %x2, float %4, i64 0
  ; merge-masked fma, passthrough is %x2 element 0
  %6 = extractelement <4 x float> %x0, i64 0
  %7 = extractelement <4 x float> %x1, i64 0
  %8 = extractelement <4 x float> %x2, i64 0
  %9 = call float @llvm.fma.f32(float %6, float %7, float %8)
  %10 = bitcast i8 %x3 to <8 x i1>
  %11 = extractelement <8 x i1> %10, i64 0
  %12 = select i1 %11, float %9, float %8
  %13 = insertelement <4 x float> %x2, float %12, i64 0
  ; unmasked fma with explicit rounding (i32 3 = round-toward-zero, SAE)
  %14 = extractelement <4 x float> %x0, i64 0
  %15 = extractelement <4 x float> %x1, i64 0
  %16 = extractelement <4 x float> %x2, i64 0
  %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 3)
  %18 = insertelement <4 x float> %x2, float %17, i64 0
  ; merge-masked fma with explicit rounding
  %19 = extractelement <4 x float> %x0, i64 0
  %20 = extractelement <4 x float> %x1, i64 0
  %21 = extractelement <4 x float> %x2, i64 0
  %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 3)
  %23 = bitcast i8 %x3 to <8 x i1>
  %24 = extractelement <8 x i1> %23, i64 0
  %25 = select i1 %24, float %22, float %21
  %26 = insertelement <4 x float> %x2, float %25, i64 0
  %res4 = fadd <4 x float> %5, %13
  %res5 = fadd <4 x float> %18, %26
  %res6 = fadd <4 x float> %res4, %res5
  ret <4 x float> %res6
}
   4591 
; Checks memory-operand folding for a merge-masked scalar fmadd (ss): both
; scalars are loaded, widened to <4 x float> with zeroed upper lanes, fused
; as a*b+a with a masked merge over %a's element, and element 0 is stored
; back to %a.
define void @fmadd_ss_mask_memfold(float* %a, float* %b, i8 %c) {
; CHECK-LABEL: fmadd_ss_mask_memfold:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; CHECK-NEXT:    vmovss %xmm0, (%rdi)
; CHECK-NEXT:    retq
  ; build <a, 0, 0, 0>
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av  = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  ; build <b, 0, 0, 0>
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv =  insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
  ; merge-masked a*b+a on element 0; falls back to a when mask bit 0 is clear
  %1 = extractelement <4 x float> %av, i64 0
  %2 = extractelement <4 x float> %bv, i64 0
  %3 = extractelement <4 x float> %av, i64 0
  %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
  %5 = bitcast i8 %c to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, float %4, float %1
  %8 = insertelement <4 x float> %av, float %7, i64 0
  %sr = extractelement <4 x float> %8, i32 0
  store float %sr, float* %a
  ret void
}
   4625 
; Checks memory-operand folding for a zero-masked scalar fmadd (ss): the b
; load should fold directly into the fma (the asm shows a `mem` operand) and
; the masked result falls back to 0.0; element 0 is stored back to %a.
define void @fmadd_ss_maskz_memfold(float* %a, float* %b, i8 %c) {
; CHECK-LABEL: fmadd_ss_maskz_memfold:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vfmadd231ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovss %xmm0, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    vmovss %xmm0, (%rdi)
; CHECK-NEXT:    retq
  ; build <a, 0, 0, 0>
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av  = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  ; build <b, 0, 0, 0>
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
  ; zero-masked a*b+a on element 0; falls back to 0.0 when mask bit 0 is clear
  %1 = extractelement <4 x float> %av, i64 0
  %2 = extractelement <4 x float> %bv, i64 0
  %3 = extractelement <4 x float> %av, i64 0
  %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
  %5 = bitcast i8 %c to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, float %4, float 0.000000e+00
  %8 = insertelement <4 x float> %av, float %7, i64 0
  %sr = extractelement <4 x float> %8, i32 0
  store float %sr, float* %a
  ret void
}
   4658 
; Double-precision counterpart of fmadd_ss_mask_memfold: merge-masked scalar
; fmadd (sd) of two loaded doubles widened to <2 x double>, with element 0
; stored back to %a.
define void @fmadd_sd_mask_memfold(double* %a, double* %b, i8 %c) {
; CHECK-LABEL: fmadd_sd_mask_memfold:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT:    vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; CHECK-NEXT:    vmovsd %xmm0, (%rdi)
; CHECK-NEXT:    retq
  ; build <a, 0>
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  ; build <b, 0>
  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
  ; merge-masked a*b+a on element 0; falls back to a when mask bit 0 is clear
  %1 = extractelement <2 x double> %av, i64 0
  %2 = extractelement <2 x double> %bv, i64 0
  %3 = extractelement <2 x double> %av, i64 0
  %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
  %5 = bitcast i8 %c to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double %1
  %8 = insertelement <2 x double> %av, double %7, i64 0
  %sr = extractelement <2 x double> %8, i32 0
  store double %sr, double* %a
  ret void
}
   4688 
; Double-precision counterpart of fmadd_ss_maskz_memfold: zero-masked scalar
; fmadd (sd); the b load should fold into the fma (asm shows a `mem`
; operand) and element 0 is stored back to %a.
define void @fmadd_sd_maskz_memfold(double* %a, double* %b, i8 %c) {
; CHECK-LABEL: fmadd_sd_maskz_memfold:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    vfmadd231sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovsd %xmm0, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    vmovsd %xmm0, (%rdi)
; CHECK-NEXT:    retq
  ; build <a, 0>
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  ; build <b, 0>
  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
  ; zero-masked a*b+a on element 0; falls back to 0.0 when mask bit 0 is clear
  %1 = extractelement <2 x double> %av, i64 0
  %2 = extractelement <2 x double> %bv, i64 0
  %3 = extractelement <2 x double> %av, i64 0
  %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
  %5 = bitcast i8 %c to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %4, double 0.000000e+00
  %8 = insertelement <2 x double> %av, double %7, i64 0
  %sr = extractelement <2 x double> %8, i32 0
  store double %sr, double* %a
  ret void
}
   4717 
; Scalar FMSUB (sd) mask3 pattern test: the subtraction is expressed as an
; fneg (fsub from -0.0) of the addend %x2 feeding llvm.fma / the rounding
; intrinsic, and the passthrough stays the un-negated %x2 — this should
; select vfmsub231sd. Four variants (unmasked/masked, default rounding /
; {rz-sae}) summed to stay live.
define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovapd %xmm2, %xmm3
; CHECK-NEXT:    vfmsub231sd {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd %xmm2, %xmm4
; CHECK-NEXT:    vfmsub231sd {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
; CHECK-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vmovapd %xmm2, %xmm4
; CHECK-NEXT:    vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
; CHECK-NEXT:    vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm4, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  ; unmasked x0*x1 - x2 (fneg of x2 folded into the fma)
  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
  %2 = extractelement <2 x double> %x0, i64 0
  %3 = extractelement <2 x double> %x1, i64 0
  %4 = extractelement <2 x double> %1, i64 0
  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
  %6 = extractelement <2 x double> %x2, i64 0
  %7 = insertelement <2 x double> %x2, double %5, i64 0
  ; merge-masked variant, passthrough is the original %x2 element 0
  %8 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
  %9 = extractelement <2 x double> %x0, i64 0
  %10 = extractelement <2 x double> %x1, i64 0
  %11 = extractelement <2 x double> %8, i64 0
  %12 = call double @llvm.fma.f64(double %9, double %10, double %11)
  %13 = extractelement <2 x double> %x2, i64 0
  %14 = bitcast i8 %x3 to <8 x i1>
  %15 = extractelement <8 x i1> %14, i64 0
  %16 = select i1 %15, double %12, double %13
  %17 = insertelement <2 x double> %x2, double %16, i64 0
  ; unmasked variant with explicit rounding (i32 3 = round-toward-zero, SAE)
  %18 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
  %19 = extractelement <2 x double> %x0, i64 0
  %20 = extractelement <2 x double> %x1, i64 0
  %21 = extractelement <2 x double> %18, i64 0
  %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 3)
  %23 = extractelement <2 x double> %x2, i64 0
  %24 = insertelement <2 x double> %x2, double %22, i64 0
  ; merge-masked variant with explicit rounding
  %25 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
  %26 = extractelement <2 x double> %x0, i64 0
  %27 = extractelement <2 x double> %x1, i64 0
  %28 = extractelement <2 x double> %25, i64 0
  %29 = call double @llvm.x86.avx512.vfmadd.f64(double %26, double %27, double %28, i32 3)
  %30 = extractelement <2 x double> %x2, i64 0
  %31 = bitcast i8 %x3 to <8 x i1>
  %32 = extractelement <8 x i1> %31, i64 0
  %33 = select i1 %32, double %29, double %30
  %34 = insertelement <2 x double> %x2, double %33, i64 0
  %res4 = fadd <2 x double> %7, %17
  %res5 = fadd <2 x double> %24, %34
  %res6 = fadd <2 x double> %res4, %res5
  ret <2 x double> %res6
}
   4772 
; Scalar FMSUB (ss) mask3 pattern test — float counterpart of the sd test
; above: fneg of the addend %x2 feeds the fma while the passthrough stays
; the un-negated %x2, which should select vfmsub231ss. Four variants summed
; to stay live.
define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, %xmm3
; CHECK-NEXT:    vfmsub231ss {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %xmm2, %xmm4
; CHECK-NEXT:    vfmsub231ss {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
; CHECK-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vmovaps %xmm2, %xmm4
; CHECK-NEXT:    vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
; CHECK-NEXT:    vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm4, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  ; unmasked x0*x1 - x2 (fneg of x2 folded into the fma)
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
  %2 = extractelement <4 x float> %x0, i64 0
  %3 = extractelement <4 x float> %x1, i64 0
  %4 = extractelement <4 x float> %1, i64 0
  %5 = call float @llvm.fma.f32(float %2, float %3, float %4)
  %6 = extractelement <4 x float> %x2, i64 0
  %7 = insertelement <4 x float> %x2, float %5, i64 0
  ; merge-masked variant, passthrough is the original %x2 element 0
  %8 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
  %9 = extractelement <4 x float> %x0, i64 0
  %10 = extractelement <4 x float> %x1, i64 0
  %11 = extractelement <4 x float> %8, i64 0
  %12 = call float @llvm.fma.f32(float %9, float %10, float %11)
  %13 = extractelement <4 x float> %x2, i64 0
  %14 = bitcast i8 %x3 to <8 x i1>
  %15 = extractelement <8 x i1> %14, i64 0
  %16 = select i1 %15, float %12, float %13
  %17 = insertelement <4 x float> %x2, float %16, i64 0
  ; unmasked variant with explicit rounding (i32 3 = round-toward-zero, SAE)
  %18 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
  %19 = extractelement <4 x float> %x0, i64 0
  %20 = extractelement <4 x float> %x1, i64 0
  %21 = extractelement <4 x float> %18, i64 0
  %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 3)
  %23 = extractelement <4 x float> %x2, i64 0
  %24 = insertelement <4 x float> %x2, float %22, i64 0
  ; merge-masked variant with explicit rounding
  %25 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
  %26 = extractelement <4 x float> %x0, i64 0
  %27 = extractelement <4 x float> %x1, i64 0
  %28 = extractelement <4 x float> %25, i64 0
  %29 = call float @llvm.x86.avx512.vfmadd.f32(float %26, float %27, float %28, i32 3)
  %30 = extractelement <4 x float> %x2, i64 0
  %31 = bitcast i8 %x3 to <8 x i1>
  %32 = extractelement <8 x i1> %31, i64 0
  %33 = select i1 %32, float %29, float %30
  %34 = insertelement <4 x float> %x2, float %33, i64 0
  %res4 = fadd <4 x float> %7, %17
  %res5 = fadd <4 x float> %24, %34
  %res6 = fadd <4 x float> %res4, %res5
  ret <4 x float> %res6
}
   4827 
; Scalar FNMSUB (sd) mask3 pattern test: BOTH the multiplicand %x0 and the
; addend %x2 are negated via fsub-from--0.0 before the fma, computing
; -(x0*x1) - x2, which should select vfnmsub231sd. Four variants
; (unmasked/masked, default rounding / {rz-sae}) summed to stay live.
define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovapd %xmm2, %xmm3
; CHECK-NEXT:    vfnmsub231sd {{.*#+}} xmm3 = -(xmm0 * xmm1) - xmm3
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd %xmm2, %xmm4
; CHECK-NEXT:    vfnmsub231sd {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
; CHECK-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vmovapd %xmm2, %xmm4
; CHECK-NEXT:    vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
; CHECK-NEXT:    vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm4, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  ; unmasked -(x0*x1) - x2 (both negations folded into the fma)
  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
  %3 = extractelement <2 x double> %1, i64 0
  %4 = extractelement <2 x double> %x1, i64 0
  %5 = extractelement <2 x double> %2, i64 0
  %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
  %7 = extractelement <2 x double> %x2, i64 0
  %8 = insertelement <2 x double> %x2, double %6, i64 0
  ; merge-masked variant, passthrough is the original %x2 element 0
  %9 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
  %10 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
  %11 = extractelement <2 x double> %9, i64 0
  %12 = extractelement <2 x double> %x1, i64 0
  %13 = extractelement <2 x double> %10, i64 0
  %14 = call double @llvm.fma.f64(double %11, double %12, double %13)
  %15 = extractelement <2 x double> %x2, i64 0
  %16 = bitcast i8 %x3 to <8 x i1>
  %17 = extractelement <8 x i1> %16, i64 0
  %18 = select i1 %17, double %14, double %15
  %19 = insertelement <2 x double> %x2, double %18, i64 0
  ; unmasked variant with explicit rounding (i32 3 = round-toward-zero, SAE)
  %20 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
  %21 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
  %22 = extractelement <2 x double> %20, i64 0
  %23 = extractelement <2 x double> %x1, i64 0
  %24 = extractelement <2 x double> %21, i64 0
  %25 = call double @llvm.x86.avx512.vfmadd.f64(double %22, double %23, double %24, i32 3)
  %26 = extractelement <2 x double> %x2, i64 0
  %27 = insertelement <2 x double> %x2, double %25, i64 0
  ; merge-masked variant with explicit rounding
  %28 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
  %29 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
  %30 = extractelement <2 x double> %28, i64 0
  %31 = extractelement <2 x double> %x1, i64 0
  %32 = extractelement <2 x double> %29, i64 0
  %33 = call double @llvm.x86.avx512.vfmadd.f64(double %30, double %31, double %32, i32 3)
  %34 = extractelement <2 x double> %x2, i64 0
  %35 = bitcast i8 %x3 to <8 x i1>
  %36 = extractelement <8 x i1> %35, i64 0
  %37 = select i1 %36, double %33, double %34
  %38 = insertelement <2 x double> %x2, double %37, i64 0
  %res4 = fadd <2 x double> %8, %19
  %res5 = fadd <2 x double> %27, %38
  %res6 = fadd <2 x double> %res4, %res5
  ret <2 x double> %res6
}
   4886 
   4887 define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
   4888 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
   4889 ; CHECK:       ## %bb.0:
   4890 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
   4891 ; CHECK-NEXT:    vfnmsub231ss {{.*#+}} xmm3 = -(xmm0 * xmm1) - xmm3
   4892 ; CHECK-NEXT:    kmovw %edi, %k1
   4893 ; CHECK-NEXT:    vmovaps %xmm2, %xmm4
   4894 ; CHECK-NEXT:    vfnmsub231ss {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
   4895 ; CHECK-NEXT:    vaddps %xmm4, %xmm3, %xmm3
   4896 ; CHECK-NEXT:    vmovaps %xmm2, %xmm4
   4897 ; CHECK-NEXT:    vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
   4898 ; CHECK-NEXT:    vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   4899 ; CHECK-NEXT:    vaddps %xmm2, %xmm4, %xmm0
   4900 ; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
   4901 ; CHECK-NEXT:    retq
   4902   %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
   4903   %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
   4904   %3 = extractelement <4 x float> %1, i64 0
   4905   %4 = extractelement <4 x float> %x1, i64 0
   4906   %5 = extractelement <4 x float> %2, i64 0
   4907   %6 = call float @llvm.fma.f32(float %3, float %4, float %5)
   4908   %7 = extractelement <4 x float> %x2, i64 0
   4909   %8 = insertelement <4 x float> %x2, float %6, i64 0
   4910   %9 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
   4911   %10 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
   4912   %11 = extractelement <4 x float> %9, i64 0
   4913   %12 = extractelement <4 x float> %x1, i64 0
   4914   %13 = extractelement <4 x float> %10, i64 0
   4915   %14 = call float @llvm.fma.f32(float %11, float %12, float %13)
   4916   %15 = extractelement <4 x float> %x2, i64 0
   4917   %16 = bitcast i8 %x3 to <8 x i1>
   4918   %17 = extractelement <8 x i1> %16, i64 0
   4919   %18 = select i1 %17, float %14, float %15
   4920   %19 = insertelement <4 x float> %x2, float %18, i64 0
   4921   %20 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
   4922   %21 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
   4923   %22 = extractelement <4 x float> %20, i64 0
   4924   %23 = extractelement <4 x float> %x1, i64 0
   4925   %24 = extractelement <4 x float> %21, i64 0
   4926   %25 = call float @llvm.x86.avx512.vfmadd.f32(float %22, float %23, float %24, i32 3)
   4927   %26 = extractelement <4 x float> %x2, i64 0
   4928   %27 = insertelement <4 x float> %x2, float %25, i64 0
   4929   %28 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
   4930   %29 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
   4931   %30 = extractelement <4 x float> %28, i64 0
   4932   %31 = extractelement <4 x float> %x1, i64 0
   4933   %32 = extractelement <4 x float> %29, i64 0
   4934   %33 = call float @llvm.x86.avx512.vfmadd.f32(float %30, float %31, float %32, i32 3)
   4935   %34 = extractelement <4 x float> %x2, i64 0
   4936   %35 = bitcast i8 %x3 to <8 x i1>
   4937   %36 = extractelement <8 x i1> %35, i64 0
   4938   %37 = select i1 %36, float %33, float %34
   4939   %38 = insertelement <4 x float> %x2, float %37, i64 0
   4940   %res4 = fadd <4 x float> %8, %19
   4941   %res5 = fadd <4 x float> %27, %38
   4942   %res6 = fadd <4 x float> %res4, %res5
   4943   ret <4 x float> %res6
   4944 }
   4945 
; mask3 scalar FMADD with a memory (broadcast-from-scalar) multiplicand:
; checks that the load folds into vfmadd231ss and the masked result merges
; into %x1 (the addend operand). Autogenerated CHECK lines.
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vfmadd231ss {{.*#+}} xmm1 = (xmm0 * mem) + xmm1
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %q = load float, float* %ptr_b
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %1 = extractelement <4 x float> %x0, i64 0
  %2 = extractelement <4 x float> %vecinit.i, i64 0
  %3 = extractelement <4 x float> %x1, i64 0
  %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
  %5 = bitcast i8 %x3 to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, float %4, float %3
  %8 = insertelement <4 x float> %x1, float %7, i64 0
  ret <4 x float> %8
}
   4965 
; Merge-masked scalar FMADD with a memory multiplicand: unlike the mask3 form
; above, the masked-off lane falls back to %x0, so lowering picks the 132
; form with xmm0 as both source and destination. Autogenerated CHECK lines.
define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
; CHECK-NEXT:    retq
  %q = load float, float* %ptr_b
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %1 = extractelement <4 x float> %x0, i64 0
  %2 = extractelement <4 x float> %vecinit.i, i64 0
  %3 = extractelement <4 x float> %x1, i64 0
  %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
  %5 = bitcast i8 %x3 to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, float %4, float %1
  %8 = insertelement <4 x float> %x0, float %7, i64 0
  ret <4 x float> %8
}
   4984 
   4985 
; Zero-masked scalar FMADD with a constant-false mask: the 'select i1 false'
; makes the FMA result dead, so the expected codegen is just zeroing lane 0
; of %x0 (vxorps + vblendps) -- no FMA instruction at all. Autogenerated
; CHECK lines.
define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; CHECK-NEXT:    retq
  %q = load float, float* %ptr_b
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %1 = extractelement <4 x float> %x0, i64 0
  %2 = extractelement <4 x float> %x1, i64 0
  %3 = extractelement <4 x float> %vecinit.i, i64 0
  %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
  %5 = select i1 false, float %4, float 0.000000e+00
  %6 = insertelement <4 x float> %x0, float %5, i64 0
  ret <4 x float> %6
}
   5002 
; llvm.x86.avx512.psll.d.512 (vpslld by xmm count): unmasked, merge-masked
; and zero-masked forms, masking expressed as select on the bitcast i16 mask.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
  ret <16 x i32> %res
}
define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psll_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
  ret <16 x i32> %res2
}
define <16 x i32> @test_x86_avx512_maskz_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psll_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
  ret <16 x i32> %res2
}
declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone
   5035 
   5036 
; llvm.x86.avx512.psll.q.512 (vpsllq by xmm count): unmasked, merge-masked
; and zero-masked forms with an i8 mask for the 8 qword lanes.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
  ret <8 x i64> %res
}
define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psll_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
  ret <8 x i64> %res2
}
define <8 x i64> @test_x86_avx512_maskz_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psll_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
  ret <8 x i64> %res2
}
declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone
   5069 
   5070 
; llvm.x86.avx512.pslli.d.512 (vpslld by immediate 7): unmasked, merge-masked
; and zero-masked forms.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <16 x i32> @test_x86_avx512_pslli_d_512(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
  ret <16 x i32> %res
}
define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
  ret <16 x i32> %res2
}
define <16 x i32> @test_x86_avx512_maskz_pslli_d_512(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
  ret <16 x i32> %res2
}
declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone
   5103 
   5104 
; llvm.x86.avx512.pslli.q.512 (vpsllq by immediate 7): unmasked, merge-masked
; and zero-masked forms. NOTE(review): the maskz variant's signature carries a
; %passthru parameter that the body never reads -- presumably a copy/paste
; from the mask variant; harmless for the test.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <8 x i64> @test_x86_avx512_pslli_q_512(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
  ret <8 x i64> %res
}
define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
  ret <8 x i64> %res2
}
define <8 x i64> @test_x86_avx512_maskz_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
  ret <8 x i64> %res2
}
declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone
   5137 
   5138 
; llvm.x86.avx512.psra.q.512 (vpsraq by xmm count, AVX-512-only qword
; arithmetic shift): unmasked, merge-masked and zero-masked forms.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
  ret <8 x i64> %res
}
define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
  ret <8 x i64> %res2
}
define <8 x i64> @test_x86_avx512_maskz_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
  ret <8 x i64> %res2
}
declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone
   5171 
   5172 
; llvm.x86.avx512.psra.d.512 (vpsrad by xmm count): unmasked, merge-masked
; and zero-masked forms.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
  ret <16 x i32> %res
}
define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
  ret <16 x i32> %res2
}
define <16 x i32> @test_x86_avx512_maskz_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
  ret <16 x i32> %res2
}
declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone
   5205 
   5206 
   5207 
; llvm.x86.avx512.psrai.q.512 (vpsraq by immediate 7): unmasked, merge-masked
; and zero-masked forms.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <8 x i64> @test_x86_avx512_psrai_q_512(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
  ret <8 x i64> %res
}
define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
  ret <8 x i64> %res2
}
define <8 x i64> @test_x86_avx512_maskz_psrai_q_512(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
  ret <8 x i64> %res2
}
declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone
   5240 
   5241 
; llvm.x86.avx512.psrai.d.512 (vpsrad by immediate 7): unmasked, merge-masked
; and zero-masked forms.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <16 x i32> @test_x86_avx512_psrai_d_512(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
  ret <16 x i32> %res
}
define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
  ret <16 x i32> %res2
}
define <16 x i32> @test_x86_avx512_maskz_psrai_d_512(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
  ret <16 x i32> %res2
}
declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone
   5274 
   5275 
   5276 
; llvm.x86.avx512.psrl.d.512 (vpsrld by xmm count): unmasked, merge-masked
; and zero-masked forms.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
  ret <16 x i32> %res
}
define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
  ret <16 x i32> %res2
}
define <16 x i32> @test_x86_avx512_maskz_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
  ret <16 x i32> %res2
}
declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone
   5309 
   5310 
; llvm.x86.avx512.psrl.q.512 (vpsrlq by xmm count): unmasked, merge-masked
; and zero-masked forms.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
  ret <8 x i64> %res
}
define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
  ret <8 x i64> %res2
}
define <8 x i64> @test_x86_avx512_maskz_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
  ret <8 x i64> %res2
}
declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone
   5343 
   5344 
; llvm.x86.avx512.psrli.d.512 (vpsrld by immediate 7): unmasked, merge-masked
; and zero-masked forms.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <16 x i32> @test_x86_avx512_psrli_d_512(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
  ret <16 x i32> %res
}
define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
  ret <16 x i32> %res2
}
define <16 x i32> @test_x86_avx512_maskz_psrli_d_512(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
  ret <16 x i32> %res2
}
declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone
   5377 
   5378 
; llvm.x86.avx512.psrli.q.512 (vpsrlq by immediate 7): unmasked, merge-masked
; and zero-masked forms.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <8 x i64> @test_x86_avx512_psrli_q_512(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
  ret <8 x i64> %res
}
define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
  ret <8 x i64> %res2
}
define <8 x i64> @test_x86_avx512_maskz_psrli_q_512(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
  ret <8 x i64> %res2
}
declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone
   5411 
; llvm.x86.avx512.psllv.d.512 (vpsllvd, per-element variable shift counts
; from a full zmm operand): unmasked, merge-masked and zero-masked forms.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
  ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_maskz_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
  ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
   5447 
; llvm.x86.avx512.psllv.q.512 (vpsllvq, per-element variable shift counts):
; unmasked, merge-masked and zero-masked forms.
; Autogenerated CHECK lines; regenerate with update_llc_test_checks.py.
define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
  ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_maskz_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
  ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
   5483 
; llvm.x86.avx512.psrav.d.512 (variable arithmetic right shift, <16 x i32>)
; lowering: unmasked, merge-masked, and zero-masked forms below.

; Unmasked form: a single vpsravd with no mask operand.
define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
  ret <16 x i32> %res
}

; Merge-masked form: the select against passthru %a2 folds into vpsravd
; writing into %zmm2 under {%k1}.
define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
  ret <16 x i32> %res2
}

; Zero-masked form: select against zeroinitializer becomes {%k1} {z}.
define <16 x i32> @test_x86_avx512_maskz_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
  ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone
   5519 
; llvm.x86.avx512.psrav.q.512 (variable arithmetic right shift, <8 x i64>)
; lowering: unmasked, merge-masked, and zero-masked forms below.

; Unmasked form: a single vpsravq with no mask operand.
define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
  ret <8 x i64> %res
}

; Merge-masked form: the select against passthru %a2 folds into vpsravq
; writing into %zmm2 under {%k1}.
define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
  ret <8 x i64> %res2
}

; Zero-masked form: select against zeroinitializer becomes {%k1} {z}.
define <8 x i64> @test_x86_avx512_maskz_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
  ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone
   5555 
; llvm.x86.avx512.psrlv.d.512 (variable logical right shift, <16 x i32>)
; lowering: unmasked, merge-masked, and zero-masked forms below.

; Unmasked form: a single vpsrlvd with no mask operand.
define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
  ret <16 x i32> %res
}

; Merge-masked form: the select against passthru %a2 folds into vpsrlvd
; writing into %zmm2 under {%k1}.
define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
  ret <16 x i32> %res2
}

; Zero-masked form: select against zeroinitializer becomes {%k1} {z}.
define <16 x i32> @test_x86_avx512_maskz_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_d_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
  ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
   5591 
; llvm.x86.avx512.psrlv.q.512 (variable logical right shift, <8 x i64>)
; lowering: unmasked, merge-masked, and zero-masked forms below.

; Unmasked form: a single vpsrlvq with no mask operand.
define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
  ret <8 x i64> %res
}

; Merge-masked form: the select against passthru %a2 folds into vpsrlvq
; writing into %zmm2 under {%k1}.
define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
  ret <8 x i64> %res2
}

; Zero-masked form: select against zeroinitializer becomes {%k1} {z}.
define <8 x i64> @test_x86_avx512_maskz_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q_512:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
  ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
   5627 
; Two <8 x i1> compare results are bitcast to i8, zero-extended to i16,
; bitcast to <16 x i1>, truncated back to <8 x i1> by shufflevector, and
; concatenated into the final <16 x i1> select mask. The generated code
; round-trips each mask through a GPR (kmovw %k0 -> movzbl -> kmovw) before
; joining the halves with kunpckbw — the suboptimal "bad" mask transition
; this test pins down (presumably to guard against miscompiles when the
; pattern is later optimized — TODO confirm intent against test history).
define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) {
; CHECK-LABEL: bad_mask_transition:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vcmplt_oqpd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vcmplt_oqpd %zmm3, %zmm2, %k0
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    movzbl %cl, %ecx
; CHECK-NEXT:    kmovw %eax, %k0
; CHECK-NEXT:    kmovw %ecx, %k1
; CHECK-NEXT:    kunpckbw %k0, %k1, %k1
; CHECK-NEXT:    vblendmps %zmm5, %zmm4, %zmm0 {%k1}
; CHECK-NEXT:    retq
entry:
  %0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i32 4)
  %1 = bitcast <8 x i1> %0 to i8
  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, i32 4)
  %3 = bitcast <8 x i1> %2 to i8
  %conv = zext i8 %1 to i16
  %conv2 = zext i8 %3 to i16
  %4 = bitcast i16 %conv to <16 x i1>
  %5 = bitcast i16 %conv2 to <16 x i1>
  %6 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %7 = shufflevector <16 x i1> %5, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = shufflevector <8 x i1> %6, <8 x i1> %7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %9 = select <16 x i1> %8, <16 x float> %f, <16 x float> %e
  ret <16 x float> %9
}
   5657 
; Simpler variant of the mask-transition pattern: a single <8 x i1> compare
; result is bitcast to i8, zero-extended to i16, and used directly as a
; <16 x i1> select mask. The generated code moves the k-register through
; %eax (kmovw -> movzbl -> kmovw) to realize the zero-extension before the
; masked blend.
define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) {
; CHECK-LABEL: bad_mask_transition_2:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    vcmplt_oqpd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vblendmps %zmm5, %zmm4, %zmm0 {%k1}
; CHECK-NEXT:    retq
entry:
  %0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i32 4)
  %1 = bitcast <8 x i1> %0 to i8
  %conv = zext i8 %1 to i16
  %2 = bitcast i16 %conv to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %f, <16 x float> %e
  ret <16 x float> %3
}
   5675