; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s

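; Note on the intrinsic call conventions exercised below: the final integer
; operand of each mask.* intrinsic is the write mask (-1 selects all lanes),
; and for the mask.cmp.*/mask.ucmp.* intrinsics the i32 operand before it is
; the comparison predicate immediate (0 through 7).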
; 256-bit

define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_pcmpeq_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
  ret i32 %res
}

define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
  ret i32 %res
}

declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)

define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_pcmpeq_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)

define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_pcmpgt_b_256
; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
  ret i32 %res
}

define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_b_256
; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
  ret i32 %res
}

declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)

define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_pcmpgt_w_256
; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_w_256
; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)

define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_cmp_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-LABEL: test_mask_cmp_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone

define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_ucmp_b_256
; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-LABEL: test_mask_ucmp_b_256
; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone

define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_cmp_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_cmp_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone

define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_ucmp_w_256
; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_ucmp_w_256
; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone

; 128-bit

define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_pcmpeq_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16)

define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_pcmpeq_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8)

define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_pcmpgt_b_128
; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_b_128
; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16)

define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_pcmpgt_w_128
; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_w_128
; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)

define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_cmp_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_cmp_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone

define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_ucmp_b_128
; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_ucmp_b_128
; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone

define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_cmp_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone

define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_ucmp_w_128
; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_w_128
; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone

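; The tests below exercise the masked 128/256-bit FMA intrinsics.
; (Assumed semantics, noted here only for orientation: the trailing i8 operand
; is the per-lane write mask; mask3 variants keep unselected lanes from the
; third source operand, while maskz variants zero them.)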
declare <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone

define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmadd256_ps
  ; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2]
  %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
  ret <8 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone

define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfmadd128_ps
  ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}

declare <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)

define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd256_pd:
; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
  ret <4 x double> %res
}

declare <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)

define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd128_pd:
; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
  ret <2 x double> %res
}

define <2 x double>@test_int_x86_avx512_mask_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vfmadd231pd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

define <4 x double>@test_int_x86_avx512_mask_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vfmadd231pd %ymm1, %ymm0, %ymm3 {%k1}
; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

define <4 x float>@test_int_x86_avx512_mask_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vfmadd231ps %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <8 x float>@test_int_x86_avx512_mask_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vfmadd231ps %ymm1, %ymm0, %ymm3 {%k1}
; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}


declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vfmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}


declare <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vfmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
; CHECK-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vfmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vfmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
; CHECK-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone

define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmadd256_ps
  ; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2]
  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
  ret <8 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone

define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmadd128_ps
  ; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2]
  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}

declare <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone

define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmadd256_pd
  ; CHECK: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2]
  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
  ret <4 x double> %res
}

declare <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone

define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmadd128_pd
  ; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2]
  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
  ret <2 x double> %res
}

declare <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone

define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmsub256_ps
  ; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2]
  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
  ret <8 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone

define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmsub128_ps
  ; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2]
  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}

declare <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone

define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
  ; CHECK-LABEL: test_mask_vfnmsub256_pd
  ; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2]
    992   %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
    993   ret <4 x double> %res
    994 }
    995 
    996 declare <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
    997 
    998 define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
    999   ; CHECK-LABEL: test_mask_vfnmsub128_pd
   1000   ; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2]
   1001   %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
   1002   ret <2 x double> %res
   1003 }
   1004 
   1005 
   1006 define <2 x double>@test_int_x86_avx512_mask_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
   1007 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_128:
   1008 ; CHECK:       ## BB#0:
   1009 ; CHECK-NEXT:    movzbl %dil, %eax
   1010 ; CHECK-NEXT:    kmovw %eax, %k1
   1011 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1012 ; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm3 {%k1}
   1013 ; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
   1014 ; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
   1015 ; CHECK-NEXT:    retq
   1016   %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
   1017   %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
   1018   %res2 = fadd <2 x double> %res, %res1
   1019   ret <2 x double> %res2
   1020 }
   1021 
   1022 declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
   1023 
   1024 define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
   1025 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_128:
   1026 ; CHECK:       ## BB#0:
   1027 ; CHECK-NEXT:    movzbl %dil, %eax
   1028 ; CHECK-NEXT:    kmovw %eax, %k1
   1029 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
   1030 ; CHECK-NEXT:    vfnmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
   1031 ; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
   1032 ; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
   1033 ; CHECK-NEXT:    retq
   1034   %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
   1035   %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
   1036   %res2 = fadd <2 x double> %res, %res1
   1037   ret <2 x double> %res2
   1038 }
   1039 
   1040 define <4 x double>@test_int_x86_avx512_mask_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
   1041 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_256:
   1042 ; CHECK:       ## BB#0:
   1043 ; CHECK-NEXT:    movzbl %dil, %eax
   1044 ; CHECK-NEXT:    kmovw %eax, %k1
   1045 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1046 ; CHECK-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm3 {%k1}
   1047 ; CHECK-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
   1048 ; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
   1049 ; CHECK-NEXT:    retq
   1050   %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
   1051   %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
   1052   %res2 = fadd <4 x double> %res, %res1
   1053   ret <4 x double> %res2
   1054 }
   1055 
   1056 declare <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
   1057 
   1058 define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
   1059 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_256:
   1060 ; CHECK:       ## BB#0:
   1061 ; CHECK-NEXT:    movzbl %dil, %eax
   1062 ; CHECK-NEXT:    kmovw %eax, %k1
   1063 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
   1064 ; CHECK-NEXT:    vfnmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
   1065 ; CHECK-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
   1066 ; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
   1067 ; CHECK-NEXT:    retq
   1068   %res = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
   1069   %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
   1070   %res2 = fadd <4 x double> %res, %res1
   1071   ret <4 x double> %res2
   1072 }
   1073 
   1074 define <4 x float>@test_int_x86_avx512_mask_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
   1075 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_128:
   1076 ; CHECK:       ## BB#0:
   1077 ; CHECK-NEXT:    movzbl %dil, %eax
   1078 ; CHECK-NEXT:    kmovw %eax, %k1
   1079 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1080 ; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm3 {%k1}
   1081 ; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
   1082 ; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
   1083 ; CHECK-NEXT:    retq
   1084   %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
   1085   %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
   1086   %res2 = fadd <4 x float> %res, %res1
   1087   ret <4 x float> %res2
   1088 }
   1089 
   1090 declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
   1091 
   1092 define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
   1093 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_128:
   1094 ; CHECK:       ## BB#0:
   1095 ; CHECK-NEXT:    movzbl %dil, %eax
   1096 ; CHECK-NEXT:    kmovw %eax, %k1
   1097 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
   1098 ; CHECK-NEXT:    vfnmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
   1099 ; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
   1100 ; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
   1101 ; CHECK-NEXT:    retq
   1102   %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
   1103   %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
   1104   %res2 = fadd <4 x float> %res, %res1
   1105   ret <4 x float> %res2
   1106 }
   1107 
   1108 define <8 x float>@test_int_x86_avx512_mask_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
   1109 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_256:
   1110 ; CHECK:       ## BB#0:
   1111 ; CHECK-NEXT:    movzbl %dil, %eax
   1112 ; CHECK-NEXT:    kmovw %eax, %k1
   1113 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1114 ; CHECK-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm3 {%k1}
   1115 ; CHECK-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0
   1116 ; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
   1117 ; CHECK-NEXT:    retq
   1118   %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
   1119   %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
   1120   %res2 = fadd <8 x float> %res, %res1
   1121   ret <8 x float> %res2
   1122 }
   1123 
   1124 declare <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
   1125 
   1126 define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
   1127 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_256:
   1128 ; CHECK:       ## BB#0:
   1129 ; CHECK-NEXT:    movzbl %dil, %eax
   1130 ; CHECK-NEXT:    kmovw %eax, %k1
   1131 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
   1132 ; CHECK-NEXT:    vfnmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
   1133 ; CHECK-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0
   1134 ; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
   1135 ; CHECK-NEXT:    retq
   1136   %res = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
   1137   %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
   1138   %res2 = fadd <8 x float> %res, %res1
   1139   ret <8 x float> %res2
   1140 }
   1141 
   1142 define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
   1143 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_128:
   1144 ; CHECK:       ## BB#0:
   1145 ; CHECK-NEXT:    movzbl %dil, %eax
   1146 ; CHECK-NEXT:    kmovw %eax, %k1
   1147 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1148 ; CHECK-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
   1149 ; CHECK-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0
   1150 ; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
   1151 ; CHECK-NEXT:    retq
   1152   %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
   1153   %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
   1154   %res2 = fadd <2 x double> %res, %res1
   1155   ret <2 x double> %res2
   1156 }
   1157 
   1158 define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
   1159 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_256:
   1160 ; CHECK:       ## BB#0:
   1161 ; CHECK-NEXT:    movzbl %dil, %eax
   1162 ; CHECK-NEXT:    kmovw %eax, %k1
   1163 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1164 ; CHECK-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
   1165 ; CHECK-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0
   1166 ; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
   1167 ; CHECK-NEXT:    retq
   1168   %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
   1169   %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
   1170   %res2 = fadd <4 x double> %res, %res1
   1171   ret <4 x double> %res2
   1172 }
   1173 
   1174 define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
   1175 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128:
   1176 ; CHECK:       ## BB#0:
   1177 ; CHECK-NEXT:    movzbl %dil, %eax
   1178 ; CHECK-NEXT:    kmovw %eax, %k1
   1179 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1180 ; CHECK-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
   1181 ; CHECK-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
   1182 ; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
   1183 ; CHECK-NEXT:    retq
   1184   %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
   1185   %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
   1186   %res2 = fadd <4 x float> %res, %res1
   1187   ret <4 x float> %res2
   1188 }
   1189 
   1190 define <8 x float>@test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
   1191 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256:
   1192 ; CHECK:       ## BB#0:
   1193 ; CHECK-NEXT:    movzbl %dil, %eax
   1194 ; CHECK-NEXT:    kmovw %eax, %k1
   1195 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1196 ; CHECK-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
   1197 ; CHECK-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
   1198 ; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
   1199 ; CHECK-NEXT:    retq
   1200   %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
   1201   %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
   1202   %res2 = fadd <8 x float> %res, %res1
   1203   ret <8 x float> %res2
   1204 }
   1205 
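; Masked VFMADDSUB intrinsic tests (mask, mask3 and maskz variants) for 128-bit
; and 256-bit ps/pd vectors.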
   1206 declare <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
   1207 
   1208 define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) {
   1209 ; CHECK-LABEL: test_mask_fmaddsub256_ps:
   1210 ; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2]
   1211   %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask)
   1212   ret <8 x float> %res
   1213 }
   1214 
   1215 declare <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
   1216 
   1217 define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
   1218 ; CHECK-LABEL: test_mask_fmaddsub128_ps:
   1219 ; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2]
   1220   %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask)
   1221   ret <4 x float> %res
   1222 }
   1223 
   1224 declare <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
   1225 
   1226 define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
   1227   ; CHECK-LABEL: test_mask_vfmaddsub256_pd
   1228   ; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2]
   1229   %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
   1230   ret <4 x double> %res
   1231 }
   1232 
   1233 declare <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
   1234 
   1235 define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
   1236   ; CHECK-LABEL: test_mask_vfmaddsub128_pd
   1237   ; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2]
   1238   %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
   1239   ret <2 x double> %res
   1240 }
   1241 
   1242 define <2 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
   1243 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_128:
   1244 ; CHECK:       ## BB#0:
   1245 ; CHECK-NEXT:    movzbl %dil, %eax
   1246 ; CHECK-NEXT:    kmovw %eax, %k1
   1247 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1248 ; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1}
   1249 ; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
   1250 ; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
   1251 ; CHECK-NEXT:    retq
   1252   %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
   1253   %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
   1254   %res2 = fadd <2 x double> %res, %res1
   1255   ret <2 x double> %res2
   1256 }
   1257 
   1258 declare <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
   1259 
   1260 define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
   1261 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_128:
   1262 ; CHECK:       ## BB#0:
   1263 ; CHECK-NEXT:    movzbl %dil, %eax
   1264 ; CHECK-NEXT:    kmovw %eax, %k1
   1265 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
   1266 ; CHECK-NEXT:    vfmaddsub231pd %xmm1, %xmm0, %xmm3 {%k1}
   1267 ; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
   1268 ; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
   1269 ; CHECK-NEXT:    retq
   1270   %res = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
   1271   %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
   1272   %res2 = fadd <2 x double> %res, %res1
   1273   ret <2 x double> %res2
   1274 }
   1275 
   1276 declare <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
   1277 
   1278 define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
   1279 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_128:
   1280 ; CHECK:       ## BB#0:
   1281 ; CHECK-NEXT:    movzbl %dil, %eax
   1282 ; CHECK-NEXT:    kmovw %eax, %k1
   1283 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1284 ; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
   1285 ; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
   1286 ; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
   1287 ; CHECK-NEXT:    retq
   1288   %res = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
   1289   %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
   1290   %res2 = fadd <2 x double> %res, %res1
   1291   ret <2 x double> %res2
   1292 }
   1293 
   1294 define <4 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
   1295 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_256:
   1296 ; CHECK:       ## BB#0:
   1297 ; CHECK-NEXT:    movzbl %dil, %eax
   1298 ; CHECK-NEXT:    kmovw %eax, %k1
   1299 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1300 ; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1}
   1301 ; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
   1302 ; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
   1303 ; CHECK-NEXT:    retq
   1304   %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
   1305   %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
   1306   %res2 = fadd <4 x double> %res, %res1
   1307   ret <4 x double> %res2
   1308 }
   1309 
   1310 declare <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
   1311 
   1312 define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
   1313 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_256:
   1314 ; CHECK:       ## BB#0:
   1315 ; CHECK-NEXT:    movzbl %dil, %eax
   1316 ; CHECK-NEXT:    kmovw %eax, %k1
   1317 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
   1318 ; CHECK-NEXT:    vfmaddsub231pd %ymm1, %ymm0, %ymm3 {%k1}
   1319 ; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
   1320 ; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
   1321 ; CHECK-NEXT:    retq
   1322   %res = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
   1323   %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
   1324   %res2 = fadd <4 x double> %res, %res1
   1325   ret <4 x double> %res2
   1326 }
   1327 
   1328 declare <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
   1329 
   1330 define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
   1331 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_256:
   1332 ; CHECK:       ## BB#0:
   1333 ; CHECK-NEXT:    movzbl %dil, %eax
   1334 ; CHECK-NEXT:    kmovw %eax, %k1
   1335 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1336 ; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
   1337 ; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
   1338 ; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
   1339 ; CHECK-NEXT:    retq
   1340   %res = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
   1341   %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
   1342   %res2 = fadd <4 x double> %res, %res1
   1343   ret <4 x double> %res2
   1344 }
   1345 
   1346 define <4 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
   1347 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_128:
   1348 ; CHECK:       ## BB#0:
   1349 ; CHECK-NEXT:    movzbl %dil, %eax
   1350 ; CHECK-NEXT:    kmovw %eax, %k1
   1351 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1352 ; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1}
   1353 ; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
   1354 ; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
   1355 ; CHECK-NEXT:    retq
   1356   %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
   1357   %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
   1358   %res2 = fadd <4 x float> %res, %res1
   1359   ret <4 x float> %res2
   1360 }
   1361 
   1362 declare <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
   1363 
   1364 define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
   1365 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_128:
   1366 ; CHECK:       ## BB#0:
   1367 ; CHECK-NEXT:    movzbl %dil, %eax
   1368 ; CHECK-NEXT:    kmovw %eax, %k1
   1369 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
   1370 ; CHECK-NEXT:    vfmaddsub231ps %xmm1, %xmm0, %xmm3 {%k1}
   1371 ; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
   1372 ; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
   1373 ; CHECK-NEXT:    retq
   1374   %res = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
   1375   %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
   1376   %res2 = fadd <4 x float> %res, %res1
   1377   ret <4 x float> %res2
   1378 }
   1379 
   1380 declare <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
   1381 
   1382 define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
   1383 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_128:
   1384 ; CHECK:       ## BB#0:
   1385 ; CHECK-NEXT:    movzbl %dil, %eax
   1386 ; CHECK-NEXT:    kmovw %eax, %k1
   1387 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1388 ; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
   1389 ; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
   1390 ; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
   1391 ; CHECK-NEXT:    retq
   1392   %res = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
   1393   %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
   1394   %res2 = fadd <4 x float> %res, %res1
   1395   ret <4 x float> %res2
   1396 }
   1397 
   1398 define <8 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
   1399 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_256:
   1400 ; CHECK:       ## BB#0:
   1401 ; CHECK-NEXT:    movzbl %dil, %eax
   1402 ; CHECK-NEXT:    kmovw %eax, %k1
   1403 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1404 ; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1}
   1405 ; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
   1406 ; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
   1407 ; CHECK-NEXT:    retq
   1408   %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
   1409   %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
   1410   %res2 = fadd <8 x float> %res, %res1
   1411   ret <8 x float> %res2
   1412 }
   1413 
   1414 declare <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
   1415 
   1416 define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
   1417 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_256:
   1418 ; CHECK:       ## BB#0:
   1419 ; CHECK-NEXT:    movzbl %dil, %eax
   1420 ; CHECK-NEXT:    kmovw %eax, %k1
   1421 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
   1422 ; CHECK-NEXT:    vfmaddsub231ps %ymm1, %ymm0, %ymm3 {%k1}
   1423 ; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
   1424 ; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
   1425 ; CHECK-NEXT:    retq
   1426   %res = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
   1427   %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
   1428   %res2 = fadd <8 x float> %res, %res1
   1429   ret <8 x float> %res2
   1430 }
   1431 
   1432 declare <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
   1433 
   1434 define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
   1435 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_256:
   1436 ; CHECK:       ## BB#0:
   1437 ; CHECK-NEXT:    movzbl %dil, %eax
   1438 ; CHECK-NEXT:    kmovw %eax, %k1
   1439 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
   1440 ; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
   1441 ; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
   1442 ; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
   1443 ; CHECK-NEXT:    retq
   1444   %res = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
   1445   %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
   1446   %res2 = fadd <8 x float> %res, %res1
   1447   ret <8 x float> %res2
   1448 }
   1449 
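; Masked VFMSUBADD (mask3 variant) intrinsic tests for 128-bit and 256-bit ps/pd vectors.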
   1450 declare <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
   1451 
   1452 define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
   1453 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_128:
   1454 ; CHECK:       ## BB#0:
   1455 ; CHECK-NEXT:    movzbl %dil, %eax
   1456 ; CHECK-NEXT:    kmovw %eax, %k1
   1457 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
   1458 ; CHECK-NEXT:    vfmsubadd231pd %xmm1, %xmm0, %xmm3 {%k1}
   1459 ; CHECK-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0
   1460 ; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
   1461 ; CHECK-NEXT:    retq
   1462   %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
   1463   %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
    1464   %res2 = fadd <2 x double> %res, %res1
   1465   ret <2 x double> %res2
   1466 }
   1467 
   1468 declare <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
   1469 
   1470 define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
   1471 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_256:
   1472 ; CHECK:       ## BB#0:
   1473 ; CHECK-NEXT:    movzbl %dil, %eax
   1474 ; CHECK-NEXT:    kmovw %eax, %k1
   1475 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
   1476 ; CHECK-NEXT:    vfmsubadd231pd %ymm1, %ymm0, %ymm3 {%k1}
   1477 ; CHECK-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0
   1478 ; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
   1479 ; CHECK-NEXT:    retq
   1480   %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
   1481   %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
    1482   %res2 = fadd <4 x double> %res, %res1
   1483   ret <4 x double> %res2
   1484 }
   1485 
   1486 declare <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
   1487 
   1488 define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
   1489 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_128:
   1490 ; CHECK:       ## BB#0:
   1491 ; CHECK-NEXT:    movzbl %dil, %eax
   1492 ; CHECK-NEXT:    kmovw %eax, %k1
   1493 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
   1494 ; CHECK-NEXT:    vfmsubadd231ps %xmm1, %xmm0, %xmm3 {%k1}
   1495 ; CHECK-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0
   1496 ; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
   1497 ; CHECK-NEXT:    retq
   1498   %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
   1499   %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
    1500   %res2 = fadd <4 x float> %res, %res1
   1501   ret <4 x float> %res2
   1502 }
   1503 
   1504 declare <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
   1505 
   1506 define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
   1507 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_256:
   1508 ; CHECK:       ## BB#0:
   1509 ; CHECK-NEXT:    movzbl %dil, %eax
   1510 ; CHECK-NEXT:    kmovw %eax, %k1
   1511 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
   1512 ; CHECK-NEXT:    vfmsubadd231ps %ymm1, %ymm0, %ymm3 {%k1}
   1513 ; CHECK-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0
   1514 ; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
   1515 ; CHECK-NEXT:    retq
   1516   %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
   1517   %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
    1518   %res2 = fadd <8 x float> %res, %res1
   1519   ret <8 x float> %res2
   1520 }
   1521 
   1522 
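; VFMADD operand-form coverage: register/register, register/memory and broadcast
; operands, with an explicit mask and with an all-ones mask.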
   1523 define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
   1524   ; CHECK-LABEL: test_mask_vfmadd128_ps_r
   1525   ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
   1526   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
   1527   ret <4 x float> %res
   1528 }
   1529 
   1530 define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   1531   ; CHECK-LABEL: test_mask_vfmadd128_ps_rz
   1532   ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
   1533   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
   1534   ret <4 x float> %res
   1535 }
   1536 
   1537 define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
   1538   ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk
   1539   ; CHECK: vfmadd213ps	(%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
   1540   %a2 = load <4 x float>, <4 x float>* %ptr_a2
   1541   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
   1542   ret <4 x float> %res
   1543 }
   1544 
   1545 define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
   1546   ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka
   1547   ; CHECK: vfmadd213ps     (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
   1548   %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8
   1549   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
   1550   ret <4 x float> %res
   1551 }
   1552 
   1553 define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
   1554   ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz
   1555   ; CHECK: vfmadd213ps	(%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
   1556   %a2 = load <4 x float>, <4 x float>* %ptr_a2
   1557   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
   1558   ret <4 x float> %res
   1559 }
   1560 
   1561 define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
   1562   ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza
   1563   ; CHECK: vfmadd213ps	(%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
   1564   %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 4
   1565   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
   1566   ret <4 x float> %res
   1567 }
   1568 
   1569 define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
   1570   ; CHECK-LABEL: test_mask_vfmadd128_ps_rmb
   1571   ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
   1572   %q = load float, float* %ptr_a2
   1573   %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
   1574   %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
   1575   %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
   1576   %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
   1577   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
   1578   ret <4 x float> %res
   1579 }
   1580 
   1581 define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
   1582   ; CHECK-LABEL: test_mask_vfmadd128_ps_rmba
   1583   ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
   1584   %q = load float, float* %ptr_a2, align 4
   1585   %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
   1586   %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
   1587   %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
   1588   %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
   1589   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
   1590   ret <4 x float> %res
   1591 }
   1592 
   1593 define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
   1594   ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz
   1595   ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0  ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
   1596   %q = load float, float* %ptr_a2
   1597   %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
   1598   %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
   1599   %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
   1600   %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
   1601   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
   1602   ret <4 x float> %res
   1603 }
   1604 
   1605 define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
   1606   ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza
   1607   ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0  ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
   1608   %q = load float, float* %ptr_a2, align 4
   1609   %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
   1610   %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
   1611   %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
   1612   %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
   1613   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
   1614   ret <4 x float> %res
   1615 }
   1616 
   1617 define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
   1618   ; CHECK-LABEL: test_mask_vfmadd128_pd_r
   1619   ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
   1620   %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
   1621   ret <2 x double> %res
   1622 }
   1623 
   1624 define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
   1625   ; CHECK-LABEL: test_mask_vfmadd128_pd_rz
   1626   ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
   1627   %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
   1628   ret <2 x double> %res
   1629 }
   1630 
   1631 define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
   1632   ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk
   1633   ; CHECK: vfmadd213pd	(%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
   1634   %a2 = load <2 x double>, <2 x double>* %ptr_a2
   1635   %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
   1636   ret <2 x double> %res
   1637 }
   1638 
   1639 define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) {
   1640   ; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz
   1641   ; CHECK: vfmadd213pd	(%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0x07]
   1642   %a2 = load <2 x double>, <2 x double>* %ptr_a2
   1643   %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
   1644   ret <2 x double> %res
   1645 }
   1646 
   1647 define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
   1648   ; CHECK-LABEL: test_mask_vfmadd256_pd_r
   1649   ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
   1650   %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
   1651   ret <4 x double> %res
   1652 }
   1653 
   1654 define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
   1655   ; CHECK-LABEL: test_mask_vfmadd256_pd_rz
   1656   ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
   1657   %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
   1658   ret <4 x double> %res
   1659 }
   1660 
   1661 define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) {
   1662   ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk
   1663   ; CHECK: vfmadd213pd	(%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
   1664   %a2 = load <4 x double>, <4 x double>* %ptr_a2
   1665   %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
   1666   ret <4 x double> %res
   1667 }
   1668 
   1669 define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) {
   1670   ; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz
   1671   ; CHECK: vfmadd213pd	(%rdi), %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0x07]
   1672   %a2 = load <4 x double>, <4 x double>* %ptr_a2
   1673   %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
   1674   ret <4 x double> %res
   1675 }
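
; Masked integer word add/sub tests (llvm.x86.avx512.mask.padd.w.* and
; llvm.x86.avx512.mask.psub.w.*).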
   1676 define <8 x i16> @test_mask_add_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
   1677   ;CHECK-LABEL: test_mask_add_epi16_rr_128
   1678   ;CHECK: vpaddw %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1]
   1679   %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   1680   ret <8 x i16> %res
   1681 }
   1682 
   1683 define <8 x i16> @test_mask_add_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
   1684   ;CHECK-LABEL: test_mask_add_epi16_rrk_128
   1685   ;CHECK: vpaddw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0xd1]
   1686   %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   1687   ret <8 x i16> %res
   1688 }
   1689 
   1690 define <8 x i16> @test_mask_add_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
   1691   ;CHECK-LABEL: test_mask_add_epi16_rrkz_128
   1692   ;CHECK: vpaddw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0xc1]
   1693   %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   1694   ret <8 x i16> %res
   1695 }
   1696 
   1697 define <8 x i16> @test_mask_add_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
   1698   ;CHECK-LABEL: test_mask_add_epi16_rm_128
   1699   ;CHECK: vpaddw (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0x07]
   1700   %b = load <8 x i16>, <8 x i16>* %ptr_b
   1701   %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   1702   ret <8 x i16> %res
   1703 }
   1704 
   1705 define <8 x i16> @test_mask_add_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
   1706   ;CHECK-LABEL: test_mask_add_epi16_rmk_128
   1707   ;CHECK: vpaddw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0x0f]
   1708   %b = load <8 x i16>, <8 x i16>* %ptr_b
   1709   %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   1710   ret <8 x i16> %res
   1711 }
   1712 
   1713 define <8 x i16> @test_mask_add_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
   1714   ;CHECK-LABEL: test_mask_add_epi16_rmkz_128
   1715   ;CHECK: vpaddw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0x07]
   1716   %b = load <8 x i16>, <8 x i16>* %ptr_b
   1717   %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   1718   ret <8 x i16> %res
   1719 }
   1720 
   1721 declare <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   1722 
   1723 define <16 x i16> @test_mask_add_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
   1724   ;CHECK-LABEL: test_mask_add_epi16_rr_256
   1725   ;CHECK: vpaddw %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc1]
   1726   %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   1727   ret <16 x i16> %res
   1728 }
   1729 
   1730 define <16 x i16> @test_mask_add_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
   1731   ;CHECK-LABEL: test_mask_add_epi16_rrk_256
   1732   ;CHECK: vpaddw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0xd1]
   1733   %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   1734   ret <16 x i16> %res
   1735 }
   1736 
   1737 define <16 x i16> @test_mask_add_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
   1738   ;CHECK-LABEL: test_mask_add_epi16_rrkz_256
   1739   ;CHECK: vpaddw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0xc1]
   1740   %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   1741   ret <16 x i16> %res
   1742 }
   1743 
   1744 define <16 x i16> @test_mask_add_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
   1745   ;CHECK-LABEL: test_mask_add_epi16_rm_256
   1746   ;CHECK: vpaddw (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0x07]
   1747   %b = load <16 x i16>, <16 x i16>* %ptr_b
   1748   %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   1749   ret <16 x i16> %res
   1750 }
   1751 
   1752 define <16 x i16> @test_mask_add_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
   1753   ;CHECK-LABEL: test_mask_add_epi16_rmk_256
   1754   ;CHECK: vpaddw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0x0f]
   1755   %b = load <16 x i16>, <16 x i16>* %ptr_b
   1756   %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   1757   ret <16 x i16> %res
   1758 }
   1759 
   1760 define <16 x i16> @test_mask_add_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
   1761   ;CHECK-LABEL: test_mask_add_epi16_rmkz_256
   1762   ;CHECK: vpaddw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0x07]
   1763   %b = load <16 x i16>, <16 x i16>* %ptr_b
   1764   %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   1765   ret <16 x i16> %res
   1766 }
   1767 
   1768 declare <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   1769 
   1770 define <8 x i16> @test_mask_sub_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
   1771   ;CHECK-LABEL: test_mask_sub_epi16_rr_128
   1772   ;CHECK: vpsubw %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0xc1]
   1773   %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   1774   ret <8 x i16> %res
   1775 }
   1776 
   1777 define <8 x i16> @test_mask_sub_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
   1778   ;CHECK-LABEL: test_mask_sub_epi16_rrk_128
   1779   ;CHECK: vpsubw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0xd1]
   1780   %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   1781   ret <8 x i16> %res
   1782 }
   1783 
   1784 define <8 x i16> @test_mask_sub_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
   1785   ;CHECK-LABEL: test_mask_sub_epi16_rrkz_128
   1786   ;CHECK: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0xc1]
   1787   %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   1788   ret <8 x i16> %res
   1789 }
   1790 
   1791 define <8 x i16> @test_mask_sub_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
   1792   ;CHECK-LABEL: test_mask_sub_epi16_rm_128
   1793   ;CHECK: vpsubw (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0x07]
   1794   %b = load <8 x i16>, <8 x i16>* %ptr_b
   1795   %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   1796   ret <8 x i16> %res
   1797 }
   1798 
   1799 define <8 x i16> @test_mask_sub_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
   1800   ;CHECK-LABEL: test_mask_sub_epi16_rmk_128
   1801   ;CHECK: vpsubw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0x0f]
   1802   %b = load <8 x i16>, <8 x i16>* %ptr_b
   1803   %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   1804   ret <8 x i16> %res
   1805 }
   1806 
   1807 define <8 x i16> @test_mask_sub_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
   1808   ;CHECK-LABEL: test_mask_sub_epi16_rmkz_128
   1809   ;CHECK: vpsubw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0x07]
   1810   %b = load <8 x i16>, <8 x i16>* %ptr_b
   1811   %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   1812   ret <8 x i16> %res
   1813 }
   1814 
   1815 declare <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   1816 
   1817 define <16 x i16> @test_mask_sub_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
   1818   ;CHECK-LABEL: test_mask_sub_epi16_rr_256
   1819   ;CHECK: vpsubw %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0xc1]
   1820   %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   1821   ret <16 x i16> %res
   1822 }
   1823 
   1824 define <16 x i16> @test_mask_sub_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
   1825   ;CHECK-LABEL: test_mask_sub_epi16_rrk_256
   1826   ;CHECK: vpsubw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0xd1]
   1827   %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   1828   ret <16 x i16> %res
   1829 }
   1830 
   1831 define <16 x i16> @test_mask_sub_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
   1832   ;CHECK-LABEL: test_mask_sub_epi16_rrkz_256
   1833   ;CHECK: vpsubw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0xc1]
   1834   %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   1835   ret <16 x i16> %res
   1836 }
   1837 
   1838 define <16 x i16> @test_mask_sub_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
   1839   ;CHECK-LABEL: test_mask_sub_epi16_rm_256
   1840   ;CHECK: vpsubw (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0x07]
   1841   %b = load <16 x i16>, <16 x i16>* %ptr_b
   1842   %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   1843   ret <16 x i16> %res
   1844 }
   1845 
   1846 define <16 x i16> @test_mask_sub_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
   1847   ;CHECK-LABEL: test_mask_sub_epi16_rmk_256
   1848   ;CHECK: vpsubw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0x0f]
   1849   %b = load <16 x i16>, <16 x i16>* %ptr_b
   1850   %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   1851   ret <16 x i16> %res
   1852 }
   1853 
   1854 define <16 x i16> @test_mask_sub_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
   1855   ;CHECK-LABEL: test_mask_sub_epi16_rmkz_256
   1856   ;CHECK: vpsubw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0x07]
   1857   %b = load <16 x i16>, <16 x i16>* %ptr_b
   1858   %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   1859   ret <16 x i16> %res
   1860 }
   1861 
   1862 declare <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   1863 
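; 512-bit variants of the word add/sub tests.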
   1864 define <32 x i16> @test_mask_add_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
   1865   ;CHECK-LABEL: test_mask_add_epi16_rr_512
   1866   ;CHECK: vpaddw %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
   1867   %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1868   ret <32 x i16> %res
   1869 }
   1870 
   1871 define <32 x i16> @test_mask_add_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
   1872   ;CHECK-LABEL: test_mask_add_epi16_rrk_512
   1873   ;CHECK: vpaddw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0xd1]
   1874   %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1875   ret <32 x i16> %res
   1876 }
   1877 
   1878 define <32 x i16> @test_mask_add_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
   1879   ;CHECK-LABEL: test_mask_add_epi16_rrkz_512
   1880   ;CHECK: vpaddw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0xc1]
   1881   %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   1882   ret <32 x i16> %res
   1883 }
   1884 
   1885 define <32 x i16> @test_mask_add_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
   1886   ;CHECK-LABEL: test_mask_add_epi16_rm_512
   1887   ;CHECK: vpaddw (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0x07]
   1888   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1889   %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1890   ret <32 x i16> %res
   1891 }
   1892 
   1893 define <32 x i16> @test_mask_add_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
   1894   ;CHECK-LABEL: test_mask_add_epi16_rmk_512
   1895   ;CHECK: vpaddw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0x0f]
   1896   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1897   %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1898   ret <32 x i16> %res
   1899 }
   1900 
   1901 define <32 x i16> @test_mask_add_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
   1902   ;CHECK-LABEL: test_mask_add_epi16_rmkz_512
   1903   ;CHECK: vpaddw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0x07]
   1904   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1905   %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   1906   ret <32 x i16> %res
   1907 }
   1908 
   1909 declare <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   1910 
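; 512-bit masked word subtract (vpsubw): register and memory operands, unmasked, merge-masked, and zero-masked.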
   1911 define <32 x i16> @test_mask_sub_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
   1912   ;CHECK-LABEL: test_mask_sub_epi16_rr_512
   1913   ;CHECK: vpsubw %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0xc1]
   1914   %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1915   ret <32 x i16> %res
   1916 }
   1917 
   1918 define <32 x i16> @test_mask_sub_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
   1919   ;CHECK-LABEL: test_mask_sub_epi16_rrk_512
   1920   ;CHECK: vpsubw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0xd1]
   1921   %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1922   ret <32 x i16> %res
   1923 }
   1924 
   1925 define <32 x i16> @test_mask_sub_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
   1926   ;CHECK-LABEL: test_mask_sub_epi16_rrkz_512
   1927   ;CHECK: vpsubw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0xc1]
   1928   %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   1929   ret <32 x i16> %res
   1930 }
   1931 
   1932 define <32 x i16> @test_mask_sub_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
   1933   ;CHECK-LABEL: test_mask_sub_epi16_rm_512
   1934   ;CHECK: vpsubw (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0x07]
   1935   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1936   %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1937   ret <32 x i16> %res
   1938 }
   1939 
   1940 define <32 x i16> @test_mask_sub_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
   1941   ;CHECK-LABEL: test_mask_sub_epi16_rmk_512
   1942   ;CHECK: vpsubw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0x0f]
   1943   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1944   %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1945   ret <32 x i16> %res
   1946 }
   1947 
   1948 define <32 x i16> @test_mask_sub_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
   1949   ;CHECK-LABEL: test_mask_sub_epi16_rmkz_512
   1950   ;CHECK: vpsubw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0x07]
   1951   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1952   %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   1953   ret <32 x i16> %res
   1954 }
   1955 
   1956 declare <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   1957 
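; Masked word multiply keeping the low 16 bits of each product (vpmullw), at 512-, 128-, and 256-bit widths.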
   1958 define <32 x i16> @test_mask_mullo_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
   1959   ;CHECK-LABEL: test_mask_mullo_epi16_rr_512
   1960   ;CHECK: vpmullw %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0xc1]
   1961   %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1962   ret <32 x i16> %res
   1963 }
   1964 
   1965 define <32 x i16> @test_mask_mullo_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
   1966   ;CHECK-LABEL: test_mask_mullo_epi16_rrk_512
   1967   ;CHECK: vpmullw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0xd1]
   1968   %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1969   ret <32 x i16> %res
   1970 }
   1971 
   1972 define <32 x i16> @test_mask_mullo_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
   1973   ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_512
   1974   ;CHECK: vpmullw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0xc1]
   1975   %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   1976   ret <32 x i16> %res
   1977 }
   1978 
   1979 define <32 x i16> @test_mask_mullo_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
   1980   ;CHECK-LABEL: test_mask_mullo_epi16_rm_512
   1981   ;CHECK: vpmullw (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0x07]
   1982   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1983   %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   1984   ret <32 x i16> %res
   1985 }
   1986 
   1987 define <32 x i16> @test_mask_mullo_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
   1988   ;CHECK-LABEL: test_mask_mullo_epi16_rmk_512
   1989   ;CHECK: vpmullw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0x0f]
   1990   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1991   %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   1992   ret <32 x i16> %res
   1993 }
   1994 
   1995 define <32 x i16> @test_mask_mullo_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
   1996   ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_512
   1997   ;CHECK: vpmullw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0x07]
   1998   %b = load <32 x i16>, <32 x i16>* %ptr_b
   1999   %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   2000   ret <32 x i16> %res
   2001 }
   2002 
   2003 declare <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
   2004 
   2005 define <8 x i16> @test_mask_mullo_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
   2006   ;CHECK-LABEL: test_mask_mullo_epi16_rr_128
   2007   ;CHECK: vpmullw %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0xc1]
   2008   %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   2009   ret <8 x i16> %res
   2010 }
   2011 
   2012 define <8 x i16> @test_mask_mullo_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
   2013   ;CHECK-LABEL: test_mask_mullo_epi16_rrk_128
   2014   ;CHECK: vpmullw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0xd1]
   2015   %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   2016   ret <8 x i16> %res
   2017 }
   2018 
   2019 define <8 x i16> @test_mask_mullo_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
   2020   ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_128
   2021   ;CHECK: vpmullw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0xc1]
   2022   %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   2023   ret <8 x i16> %res
   2024 }
   2025 
   2026 define <8 x i16> @test_mask_mullo_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
   2027   ;CHECK-LABEL: test_mask_mullo_epi16_rm_128
   2028   ;CHECK: vpmullw (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0x07]
   2029   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2030   %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   2031   ret <8 x i16> %res
   2032 }
   2033 
   2034 define <8 x i16> @test_mask_mullo_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
   2035   ;CHECK-LABEL: test_mask_mullo_epi16_rmk_128
   2036   ;CHECK: vpmullw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0x0f]
   2037   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2038   %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   2039   ret <8 x i16> %res
   2040 }
   2041 
   2042 define <8 x i16> @test_mask_mullo_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
   2043   ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_128
   2044   ;CHECK: vpmullw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0x07]
   2045   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2046   %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   2047   ret <8 x i16> %res
   2048 }
   2049 
   2050 declare <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   2051 
   2052 define <16 x i16> @test_mask_mullo_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
   2053   ;CHECK-LABEL: test_mask_mullo_epi16_rr_256
   2054   ;CHECK: vpmullw %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf1,0x7d,0x28,0xd5,0xc1]
   2055   %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   2056   ret <16 x i16> %res
   2057 }
   2058 
   2059 define <16 x i16> @test_mask_mullo_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
   2060   ;CHECK-LABEL: test_mask_mullo_epi16_rrk_256
   2061   ;CHECK: vpmullw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0xd1]
   2062   %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   2063   ret <16 x i16> %res
   2064 }
   2065 
   2066 define <16 x i16> @test_mask_mullo_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
   2067   ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_256
   2068   ;CHECK: vpmullw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0xc1]
   2069   %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   2070   ret <16 x i16> %res
   2071 }
   2072 
   2073 define <16 x i16> @test_mask_mullo_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
   2074   ;CHECK-LABEL: test_mask_mullo_epi16_rm_256
   2075   ;CHECK: vpmullw (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf1,0x7d,0x28,0xd5,0x07]
   2076   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2077   %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   2078   ret <16 x i16> %res
   2079 }
   2080 
   2081 define <16 x i16> @test_mask_mullo_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
   2082   ;CHECK-LABEL: test_mask_mullo_epi16_rmk_256
   2083   ;CHECK: vpmullw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0x0f]
   2084   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2085   %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   2086   ret <16 x i16> %res
   2087 }
   2088 
   2089 define <16 x i16> @test_mask_mullo_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
   2090   ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_256
   2091   ;CHECK: vpmullw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0x07]
   2092   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2093   %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   2094   ret <16 x i16> %res
   2095 }
   2096 
   2097 declare <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   2098 
   2099 
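; Masked signed-saturating pack, dword to word (vpackssdw), 128- and 256-bit, including {1toN} embedded-broadcast memory operands.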
   2100 define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
   2101   ;CHECK-LABEL: test_mask_packs_epi32_rr_128
   2102   ;CHECK: vpackssdw       %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0xc1]
   2103   %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
   2104   ret <8 x i16> %res
   2105 }
   2106 
   2107 define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
   2108   ;CHECK-LABEL: test_mask_packs_epi32_rrk_128
   2109   ;CHECK: vpackssdw       %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
   2110   %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
   2111   ret <8 x i16> %res
   2112 }
   2113 
   2114 define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
   2115   ;CHECK-LABEL: test_mask_packs_epi32_rrkz_128
   2116   ;CHECK: vpackssdw       %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
   2117   %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
   2118   ret <8 x i16> %res
   2119 }
   2120 
   2121 define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
   2122   ;CHECK-LABEL: test_mask_packs_epi32_rm_128
   2123   ;CHECK: vpackssdw       (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0x07]
   2124   %b = load <4 x i32>, <4 x i32>* %ptr_b
   2125   %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
   2126   ret <8 x i16> %res
   2127 }
   2128 
   2129 define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
   2130   ;CHECK-LABEL: test_mask_packs_epi32_rmk_128
   2131   ;CHECK: vpackssdw       (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
   2132   %b = load <4 x i32>, <4 x i32>* %ptr_b
   2133   %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
   2134   ret <8 x i16> %res
   2135 }
   2136 
   2137 define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
   2138   ;CHECK-LABEL: test_mask_packs_epi32_rmkz_128
   2139   ;CHECK: vpackssdw       (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
   2140   %b = load <4 x i32>, <4 x i32>* %ptr_b
   2141   %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
   2142   ret <8 x i16> %res
   2143 }
   2144 
   2145 define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
   2146   ;CHECK-LABEL: test_mask_packs_epi32_rmb_128
   2147   ;CHECK: vpackssdw       (%rdi){1to4}, %xmm0, %xmm0  ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
   2148   %q = load i32, i32* %ptr_b
   2149   %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
   2150   %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
   2151   %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
   2152   ret <8 x i16> %res
   2153 }
   2154 
   2155 define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
   2156   ;CHECK-LABEL: test_mask_packs_epi32_rmbk_128
   2157   ;CHECK: vpackssdw       (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
   2158   %q = load i32, i32* %ptr_b
   2159   %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
   2160   %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
   2161   %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
   2162   ret <8 x i16> %res
   2163 }
   2164 
   2165 define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
   2166   ;CHECK-LABEL: test_mask_packs_epi32_rmbkz_128
   2167   ;CHECK: vpackssdw       (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
   2168   %q = load i32, i32* %ptr_b
   2169   %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
   2170   %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
   2171   %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
   2172   ret <8 x i16> %res
   2173 }
   2174 
   2175 declare <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)
   2176 
   2177 define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
   2178   ;CHECK-LABEL: test_mask_packs_epi32_rr_256
   2179   ;CHECK: vpackssdw       %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
   2180   %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
   2181   ret <16 x i16> %res
   2182 }
   2183 
   2184 define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
   2185   ;CHECK-LABEL: test_mask_packs_epi32_rrk_256
   2186   ;CHECK: vpackssdw       %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
   2187   %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
   2188   ret <16 x i16> %res
   2189 }
   2190 
   2191 define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
   2192   ;CHECK-LABEL: test_mask_packs_epi32_rrkz_256
   2193   ;CHECK: vpackssdw       %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
   2194   %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
   2195   ret <16 x i16> %res
   2196 }
   2197 
   2198 define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
   2199   ;CHECK-LABEL: test_mask_packs_epi32_rm_256
   2200   ;CHECK: vpackssdw       (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0x07]
   2201   %b = load <8 x i32>, <8 x i32>* %ptr_b
   2202   %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
   2203   ret <16 x i16> %res
   2204 }
   2205 
   2206 define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
   2207   ;CHECK-LABEL: test_mask_packs_epi32_rmk_256
   2208   ;CHECK: vpackssdw       (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
   2209   %b = load <8 x i32>, <8 x i32>* %ptr_b
   2210   %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
   2211   ret <16 x i16> %res
   2212 }
   2213 
   2214 define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
   2215   ;CHECK-LABEL: test_mask_packs_epi32_rmkz_256
   2216   ;CHECK: vpackssdw       (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
   2217   %b = load <8 x i32>, <8 x i32>* %ptr_b
   2218   %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
   2219   ret <16 x i16> %res
   2220 }
   2221 
   2222 define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
   2223   ;CHECK-LABEL: test_mask_packs_epi32_rmb_256
   2224   ;CHECK: vpackssdw       (%rdi){1to8}, %ymm0, %ymm0  ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
   2225   %q = load i32, i32* %ptr_b
   2226   %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
   2227   %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
   2228   %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
   2229   ret <16 x i16> %res
   2230 }
   2231 
   2232 define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
   2233   ;CHECK-LABEL: test_mask_packs_epi32_rmbk_256
   2234   ;CHECK: vpackssdw       (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
   2235   %q = load i32, i32* %ptr_b
   2236   %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
   2237   %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
   2238   %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
   2239   ret <16 x i16> %res
   2240 }
   2241 
   2242 define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
   2243   ;CHECK-LABEL: test_mask_packs_epi32_rmbkz_256
    2244   ;CHECK: vpackssdw       (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
   2245   %q = load i32, i32* %ptr_b
   2246   %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
   2247   %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
   2248   %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
   2249   ret <16 x i16> %res
   2250 }
   2251 
   2252 declare <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16)
   2253 
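; Masked signed-saturating pack, word to byte (vpacksswb), 128- and 256-bit.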
   2254 define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
   2255   ;CHECK-LABEL: test_mask_packs_epi16_rr_128
   2256   ;CHECK: vpacksswb       %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc1]
   2257   %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
   2258   ret <16 x i8> %res
   2259 }
   2260 
   2261 define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
   2262   ;CHECK-LABEL: test_mask_packs_epi16_rrk_128
   2263   ;CHECK: vpacksswb       %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x63,0xd1]
   2264   %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
   2265   ret <16 x i8> %res
   2266 }
   2267 
   2268 define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
   2269   ;CHECK-LABEL: test_mask_packs_epi16_rrkz_128
   2270   ;CHECK: vpacksswb       %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x63,0xc1]
   2271   %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
   2272   ret <16 x i8> %res
   2273 }
   2274 
   2275 define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
   2276   ;CHECK-LABEL: test_mask_packs_epi16_rm_128
   2277   ;CHECK: vpacksswb       (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0x07]
   2278   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2279   %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
   2280   ret <16 x i8> %res
   2281 }
   2282 
   2283 define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
   2284   ;CHECK-LABEL: test_mask_packs_epi16_rmk_128
   2285   ;CHECK: vpacksswb       (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x63,0x0f]
   2286   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2287   %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
   2288   ret <16 x i8> %res
   2289 }
   2290 
   2291 define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
   2292   ;CHECK-LABEL: test_mask_packs_epi16_rmkz_128
   2293   ;CHECK: vpacksswb       (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x63,0x07]
   2294   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2295   %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
   2296   ret <16 x i8> %res
   2297 }
   2298 
   2299 declare <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16)
   2300 
   2301 define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
   2302   ;CHECK-LABEL: test_mask_packs_epi16_rr_256
   2303   ;CHECK: vpacksswb       %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
   2304   %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
   2305   ret <32 x i8> %res
   2306 }
   2307 
   2308 define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
   2309   ;CHECK-LABEL: test_mask_packs_epi16_rrk_256
   2310   ;CHECK: vpacksswb       %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x63,0xd1]
   2311   %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
   2312   ret <32 x i8> %res
   2313 }
   2314 
   2315 define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
   2316   ;CHECK-LABEL: test_mask_packs_epi16_rrkz_256
   2317   ;CHECK: vpacksswb       %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x63,0xc1]
   2318   %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
   2319   ret <32 x i8> %res
   2320 }
   2321 
   2322 define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
   2323   ;CHECK-LABEL: test_mask_packs_epi16_rm_256
   2324   ;CHECK: vpacksswb       (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0x07]
   2325   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2326   %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
   2327   ret <32 x i8> %res
   2328 }
   2329 
   2330 define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
   2331   ;CHECK-LABEL: test_mask_packs_epi16_rmk_256
   2332   ;CHECK: vpacksswb       (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x63,0x0f]
   2333   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2334   %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
   2335   ret <32 x i8> %res
   2336 }
   2337 
   2338 define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
   2339   ;CHECK-LABEL: test_mask_packs_epi16_rmkz_256
   2340   ;CHECK: vpacksswb       (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x63,0x07]
   2341   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2342   %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
   2343   ret <32 x i8> %res
   2344 }
   2345 
   2346 declare <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16>, <16 x i16>, <32 x i8>, i32)
   2347 
   2348 
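; Masked unsigned-saturating pack, dword to word (vpackusdw), 128- and 256-bit, including broadcast forms.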
   2349 define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
   2350   ;CHECK-LABEL: test_mask_packus_epi32_rr_128
   2351   ;CHECK: vpackusdw       %xmm1, %xmm0, %xmm0 
   2352   %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
   2353   ret <8 x i16> %res
   2354 }
   2355 
   2356 define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
   2357   ;CHECK-LABEL: test_mask_packus_epi32_rrk_128
   2358   ;CHECK: vpackusdw       %xmm1, %xmm0, %xmm2 {%k1} 
   2359   %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
   2360   ret <8 x i16> %res
   2361 }
   2362 
   2363 define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
   2364   ;CHECK-LABEL: test_mask_packus_epi32_rrkz_128
   2365   ;CHECK: vpackusdw       %xmm1, %xmm0, %xmm0 {%k1} {z} 
   2366   %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
   2367   ret <8 x i16> %res
   2368 }
   2369 
   2370 define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
   2371   ;CHECK-LABEL: test_mask_packus_epi32_rm_128
   2372   ;CHECK: vpackusdw       (%rdi), %xmm0, %xmm0 
   2373   %b = load <4 x i32>, <4 x i32>* %ptr_b
   2374   %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
   2375   ret <8 x i16> %res
   2376 }
   2377 
   2378 define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
   2379   ;CHECK-LABEL: test_mask_packus_epi32_rmk_128
   2380   ;CHECK: vpackusdw       (%rdi), %xmm0, %xmm1 {%k1} 
   2381   %b = load <4 x i32>, <4 x i32>* %ptr_b
   2382   %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
   2383   ret <8 x i16> %res
   2384 }
   2385 
   2386 define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
   2387   ;CHECK-LABEL: test_mask_packus_epi32_rmkz_128
   2388   ;CHECK: vpackusdw       (%rdi), %xmm0, %xmm0 {%k1} {z} 
   2389   %b = load <4 x i32>, <4 x i32>* %ptr_b
   2390   %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
   2391   ret <8 x i16> %res
   2392 }
   2393 
   2394 define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
   2395   ;CHECK-LABEL: test_mask_packus_epi32_rmb_128
   2396   ;CHECK: vpackusdw       (%rdi){1to4}, %xmm0, %xmm0  
   2397   %q = load i32, i32* %ptr_b
   2398   %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
   2399   %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
   2400   %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
   2401   ret <8 x i16> %res
   2402 }
   2403 
   2404 define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
   2405   ;CHECK-LABEL: test_mask_packus_epi32_rmbk_128
   2406   ;CHECK: vpackusdw       (%rdi){1to4}, %xmm0, %xmm1 {%k1} 
   2407   %q = load i32, i32* %ptr_b
   2408   %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
   2409   %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
   2410   %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
   2411   ret <8 x i16> %res
   2412 }
   2413 
   2414 define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
   2415   ;CHECK-LABEL: test_mask_packus_epi32_rmbkz_128
   2416   ;CHECK: vpackusdw       (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} 
   2417   %q = load i32, i32* %ptr_b
   2418   %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
   2419   %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
   2420   %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
   2421   ret <8 x i16> %res
   2422 }
   2423 
   2424 declare <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)
   2425 
   2426 define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
   2427   ;CHECK-LABEL: test_mask_packus_epi32_rr_256
   2428   ;CHECK: vpackusdw       %ymm1, %ymm0, %ymm0 
   2429   %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
   2430   ret <16 x i16> %res
   2431 }
   2432 
   2433 define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
   2434   ;CHECK-LABEL: test_mask_packus_epi32_rrk_256
   2435   ;CHECK: vpackusdw       %ymm1, %ymm0, %ymm2 {%k1} 
   2436   %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
   2437   ret <16 x i16> %res
   2438 }
   2439 
   2440 define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
   2441   ;CHECK-LABEL: test_mask_packus_epi32_rrkz_256
   2442   ;CHECK: vpackusdw       %ymm1, %ymm0, %ymm0 {%k1} {z} 
   2443   %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
   2444   ret <16 x i16> %res
   2445 }
   2446 
   2447 define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
   2448   ;CHECK-LABEL: test_mask_packus_epi32_rm_256
   2449   ;CHECK: vpackusdw       (%rdi), %ymm0, %ymm0 
   2450   %b = load <8 x i32>, <8 x i32>* %ptr_b
   2451   %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
   2452   ret <16 x i16> %res
   2453 }
   2454 
   2455 define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
   2456   ;CHECK-LABEL: test_mask_packus_epi32_rmk_256
   2457   ;CHECK: vpackusdw       (%rdi), %ymm0, %ymm1 {%k1} 
   2458   %b = load <8 x i32>, <8 x i32>* %ptr_b
   2459   %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
   2460   ret <16 x i16> %res
   2461 }
   2462 
   2463 define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
   2464   ;CHECK-LABEL: test_mask_packus_epi32_rmkz_256
   2465   ;CHECK: vpackusdw       (%rdi), %ymm0, %ymm0 {%k1} {z} 
   2466   %b = load <8 x i32>, <8 x i32>* %ptr_b
   2467   %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
   2468   ret <16 x i16> %res
   2469 }
   2470 
   2471 define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
   2472   ;CHECK-LABEL: test_mask_packus_epi32_rmb_256
   2473   ;CHECK: vpackusdw       (%rdi){1to8}, %ymm0, %ymm0  
   2474   %q = load i32, i32* %ptr_b
   2475   %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
   2476   %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
   2477   %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
   2478   ret <16 x i16> %res
   2479 }
   2480 
   2481 define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
   2482   ;CHECK-LABEL: test_mask_packus_epi32_rmbk_256
   2483   ;CHECK: vpackusdw       (%rdi){1to8}, %ymm0, %ymm1 {%k1} 
   2484   %q = load i32, i32* %ptr_b
   2485   %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
   2486   %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
   2487   %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
   2488   ret <16 x i16> %res
   2489 }
   2490 
   2491 define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
   2492   ;CHECK-LABEL: test_mask_packus_epi32_rmbkz_256
   2493   ;CHECK: vpackusdw       (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} 
   2494   %q = load i32, i32* %ptr_b
   2495   %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
   2496   %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
   2497   %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
   2498   ret <16 x i16> %res
   2499 }
   2500 
   2501 declare <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16)
   2502 
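; Masked unsigned-saturating pack, word to byte (vpackuswb), 128- and 256-bit.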
   2503 define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
   2504   ;CHECK-LABEL: test_mask_packus_epi16_rr_128
   2505   ;CHECK: vpackuswb       %xmm1, %xmm0, %xmm0 
   2506   %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
   2507   ret <16 x i8> %res
   2508 }
   2509 
   2510 define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
   2511   ;CHECK-LABEL: test_mask_packus_epi16_rrk_128
   2512   ;CHECK: vpackuswb       %xmm1, %xmm0, %xmm2 {%k1} 
   2513   %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
   2514   ret <16 x i8> %res
   2515 }
   2516 
   2517 define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
   2518   ;CHECK-LABEL: test_mask_packus_epi16_rrkz_128
   2519   ;CHECK: vpackuswb       %xmm1, %xmm0, %xmm0 {%k1} {z} 
   2520   %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
   2521   ret <16 x i8> %res
   2522 }
   2523 
   2524 define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
   2525   ;CHECK-LABEL: test_mask_packus_epi16_rm_128
   2526   ;CHECK: vpackuswb       (%rdi), %xmm0, %xmm0 
   2527   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2528   %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
   2529   ret <16 x i8> %res
   2530 }
   2531 
   2532 define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
   2533   ;CHECK-LABEL: test_mask_packus_epi16_rmk_128
   2534   ;CHECK: vpackuswb       (%rdi), %xmm0, %xmm1 {%k1} 
   2535   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2536   %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
   2537   ret <16 x i8> %res
   2538 }
   2539 
   2540 define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
   2541   ;CHECK-LABEL: test_mask_packus_epi16_rmkz_128
   2542   ;CHECK: vpackuswb       (%rdi), %xmm0, %xmm0 {%k1} {z} 
   2543   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2544   %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
   2545   ret <16 x i8> %res
   2546 }
   2547 
   2548 declare <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16)
   2549 
   2550 define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
   2551   ;CHECK-LABEL: test_mask_packus_epi16_rr_256
   2552   ;CHECK: vpackuswb       %ymm1, %ymm0, %ymm0 
   2553   %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
   2554   ret <32 x i8> %res
   2555 }
   2556 
   2557 define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
   2558   ;CHECK-LABEL: test_mask_packus_epi16_rrk_256
   2559   ;CHECK: vpackuswb       %ymm1, %ymm0, %ymm2 {%k1} 
   2560   %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
   2561   ret <32 x i8> %res
   2562 }
   2563 
   2564 define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
   2565   ;CHECK-LABEL: test_mask_packus_epi16_rrkz_256
   2566   ;CHECK: vpackuswb       %ymm1, %ymm0, %ymm0 {%k1} {z} 
   2567   %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
   2568   ret <32 x i8> %res
   2569 }
   2570 
   2571 define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
   2572   ;CHECK-LABEL: test_mask_packus_epi16_rm_256
   2573   ;CHECK: vpackuswb       (%rdi), %ymm0, %ymm0 
   2574   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2575   %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
   2576   ret <32 x i8> %res
   2577 }
   2578 
   2579 define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
   2580   ;CHECK-LABEL: test_mask_packus_epi16_rmk_256
   2581   ;CHECK: vpackuswb       (%rdi), %ymm0, %ymm1 {%k1} 
   2582   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2583   %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
   2584   ret <32 x i8> %res
   2585 }
   2586 
   2587 define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
   2588   ;CHECK-LABEL: test_mask_packus_epi16_rmkz_256
   2589   ;CHECK: vpackuswb       (%rdi), %ymm0, %ymm0 {%k1} {z} 
   2590   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2591   %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
   2592   ret <32 x i8> %res
   2593 }
   2594 
   2595 declare <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16>, <16 x i16>, <32 x i8>, i32)
   2596 
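; Masked signed-saturating word add (vpaddsw), 128- and 256-bit.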
   2597 define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
   2598   ;CHECK-LABEL: test_mask_adds_epi16_rr_128
   2599   ;CHECK: vpaddsw %xmm1, %xmm0, %xmm0 
   2600   %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   2601   ret <8 x i16> %res
   2602 }
   2603 
   2604 define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
   2605   ;CHECK-LABEL: test_mask_adds_epi16_rrk_128
   2606   ;CHECK: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} 
   2607   %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   2608   ret <8 x i16> %res
   2609 }
   2610 
   2611 define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
   2612   ;CHECK-LABEL: test_mask_adds_epi16_rrkz_128
   2613   ;CHECK: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} 
   2614   %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   2615   ret <8 x i16> %res
   2616 }
   2617 
   2618 define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
   2619   ;CHECK-LABEL: test_mask_adds_epi16_rm_128
   2620   ;CHECK: vpaddsw (%rdi), %xmm0, %xmm0 
   2621   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2622   %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   2623   ret <8 x i16> %res
   2624 }
   2625 
   2626 define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
   2627   ;CHECK-LABEL: test_mask_adds_epi16_rmk_128
   2628   ;CHECK: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} 
   2629   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2630   %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   2631   ret <8 x i16> %res
   2632 }
   2633 
   2634 define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
   2635   ;CHECK-LABEL: test_mask_adds_epi16_rmkz_128
   2636   ;CHECK: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} 
   2637   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2638   %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   2639   ret <8 x i16> %res
   2640 }
   2641 
   2642 declare <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   2643 
   2644 define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
   2645   ;CHECK-LABEL: test_mask_adds_epi16_rr_256
   2646   ;CHECK: vpaddsw %ymm1, %ymm0, %ymm0 
   2647   %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   2648   ret <16 x i16> %res
   2649 }
   2650 
   2651 define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
   2652   ;CHECK-LABEL: test_mask_adds_epi16_rrk_256
   2653   ;CHECK: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} 
   2654   %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   2655   ret <16 x i16> %res
   2656 }
   2657 
   2658 define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
   2659   ;CHECK-LABEL: test_mask_adds_epi16_rrkz_256
   2660   ;CHECK: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} 
   2661   %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   2662   ret <16 x i16> %res
   2663 }
   2664 
   2665 define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
   2666   ;CHECK-LABEL: test_mask_adds_epi16_rm_256
   2667   ;CHECK: vpaddsw (%rdi), %ymm0, %ymm0    
   2668   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2669   %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   2670   ret <16 x i16> %res
   2671 }
   2672 
   2673 define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
   2674   ;CHECK-LABEL: test_mask_adds_epi16_rmk_256
   2675   ;CHECK: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} 
   2676   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2677   %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   2678   ret <16 x i16> %res
   2679 }
   2680 
   2681 define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
   2682   ;CHECK-LABEL: test_mask_adds_epi16_rmkz_256
   2683   ;CHECK: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} 
   2684   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2685   %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   2686   ret <16 x i16> %res
   2687 }
   2688 
   2689 declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   2690 
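; Masked signed-saturating word subtract (vpsubsw), 128- and 256-bit.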
   2691 define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
   2692   ;CHECK-LABEL: test_mask_subs_epi16_rr_128
   2693   ;CHECK: vpsubsw %xmm1, %xmm0, %xmm0     
   2694   %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   2695   ret <8 x i16> %res
   2696 }
   2697 
   2698 define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
   2699   ;CHECK-LABEL: test_mask_subs_epi16_rrk_128
   2700   ;CHECK: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} 
   2701   %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   2702   ret <8 x i16> %res
   2703 }
   2704 
   2705 define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
   2706   ;CHECK-LABEL: test_mask_subs_epi16_rrkz_128
   2707   ;CHECK: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} 
   2708   %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   2709   ret <8 x i16> %res
   2710 }
   2711 
   2712 define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
   2713   ;CHECK-LABEL: test_mask_subs_epi16_rm_128
   2714   ;CHECK: vpsubsw (%rdi), %xmm0, %xmm0
   2715   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2716   %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   2717   ret <8 x i16> %res
   2718 }
   2719 
   2720 define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
   2721   ;CHECK-LABEL: test_mask_subs_epi16_rmk_128
   2722   ;CHECK: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} 
   2723   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2724   %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   2725   ret <8 x i16> %res
   2726 }
   2727 
   2728 define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
   2729   ;CHECK-LABEL: test_mask_subs_epi16_rmkz_128
   2730   ;CHECK: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} 
   2731   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2732   %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   2733   ret <8 x i16> %res
   2734 }
   2735 
   2736 declare <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   2737 
   2738 define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
   2739   ;CHECK-LABEL: test_mask_subs_epi16_rr_256
   2740   ;CHECK: vpsubsw %ymm1, %ymm0, %ymm0     
   2741   %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   2742   ret <16 x i16> %res
   2743 }
   2744 
   2745 define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
   2746   ;CHECK-LABEL: test_mask_subs_epi16_rrk_256
   2747   ;CHECK: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} 
   2748   %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   2749   ret <16 x i16> %res
   2750 }
   2751 
   2752 define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
   2753   ;CHECK-LABEL: test_mask_subs_epi16_rrkz_256
   2754   ;CHECK: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} 
   2755   %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   2756   ret <16 x i16> %res
   2757 }
   2758 
   2759 define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
   2760   ;CHECK-LABEL: test_mask_subs_epi16_rm_256
   2761   ;CHECK: vpsubsw (%rdi), %ymm0, %ymm0    
   2762   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2763   %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   2764   ret <16 x i16> %res
   2765 }
   2766 
   2767 define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
   2768   ;CHECK-LABEL: test_mask_subs_epi16_rmk_256
   2769   ;CHECK: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} 
   2770   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2771   %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   2772   ret <16 x i16> %res
   2773 }
   2774 
   2775 define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
   2776   ;CHECK-LABEL: test_mask_subs_epi16_rmkz_256
   2777   ;CHECK: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} 
   2778   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2779   %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   2780   ret <16 x i16> %res
   2781 }
   2782 
   2783 declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   2784 
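; Masked unsigned-saturating word add (vpaddusw), 128- and 256-bit.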
   2785 define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
   2786   ;CHECK-LABEL: test_mask_adds_epu16_rr_128
   2787   ;CHECK: vpaddusw %xmm1, %xmm0, %xmm0 
   2788   %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   2789   ret <8 x i16> %res
   2790 }
   2791 
   2792 define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
   2793   ;CHECK-LABEL: test_mask_adds_epu16_rrk_128
   2794   ;CHECK: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} 
   2795   %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   2796   ret <8 x i16> %res
   2797 }
   2798 
   2799 define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
   2800   ;CHECK-LABEL: test_mask_adds_epu16_rrkz_128
   2801   ;CHECK: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} 
   2802   %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   2803   ret <8 x i16> %res
   2804 }
   2805 
   2806 define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
   2807   ;CHECK-LABEL: test_mask_adds_epu16_rm_128
   2808   ;CHECK: vpaddusw (%rdi), %xmm0, %xmm0 
   2809   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2810   %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   2811   ret <8 x i16> %res
   2812 }
   2813 
   2814 define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
   2815   ;CHECK-LABEL: test_mask_adds_epu16_rmk_128
   2816   ;CHECK: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} 
   2817   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2818   %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   2819   ret <8 x i16> %res
   2820 }
   2821 
   2822 define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
   2823   ;CHECK-LABEL: test_mask_adds_epu16_rmkz_128
   2824   ;CHECK: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} 
   2825   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2826   %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   2827   ret <8 x i16> %res
   2828 }
   2829 
   2830 declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   2831 
   2832 define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
   2833   ;CHECK-LABEL: test_mask_adds_epu16_rr_256
   2834   ;CHECK: vpaddusw %ymm1, %ymm0, %ymm0 
   2835   %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   2836   ret <16 x i16> %res
   2837 }
   2838 
   2839 define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
   2840   ;CHECK-LABEL: test_mask_adds_epu16_rrk_256
   2841   ;CHECK: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} 
   2842   %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   2843   ret <16 x i16> %res
   2844 }
   2845 
   2846 define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
   2847   ;CHECK-LABEL: test_mask_adds_epu16_rrkz_256
   2848   ;CHECK: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} 
   2849   %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   2850   ret <16 x i16> %res
   2851 }
   2852 
   2853 define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
   2854   ;CHECK-LABEL: test_mask_adds_epu16_rm_256
   2855   ;CHECK: vpaddusw (%rdi), %ymm0, %ymm0    
   2856   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2857   %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   2858   ret <16 x i16> %res
   2859 }
   2860 
   2861 define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
   2862   ;CHECK-LABEL: test_mask_adds_epu16_rmk_256
   2863   ;CHECK: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} 
   2864   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2865   %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   2866   ret <16 x i16> %res
   2867 }
   2868 
   2869 define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
   2870   ;CHECK-LABEL: test_mask_adds_epu16_rmkz_256
   2871   ;CHECK: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} 
   2872   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2873   %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   2874   ret <16 x i16> %res
   2875 }
   2876 
   2877 declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   2878 
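; Unsigned saturating subtract of i16 elements (vpsubusw).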
   2879 define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
   2880   ;CHECK-LABEL: test_mask_subs_epu16_rr_128
   2881   ;CHECK: vpsubusw %xmm1, %xmm0, %xmm0     
   2882   %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   2883   ret <8 x i16> %res
   2884 }
   2885 
   2886 define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
   2887   ;CHECK-LABEL: test_mask_subs_epu16_rrk_128
   2888   ;CHECK: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} 
   2889   %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   2890   ret <8 x i16> %res
   2891 }
   2892 
   2893 define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
   2894   ;CHECK-LABEL: test_mask_subs_epu16_rrkz_128
   2895   ;CHECK: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} 
   2896   %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   2897   ret <8 x i16> %res
   2898 }
   2899 
   2900 define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
   2901   ;CHECK-LABEL: test_mask_subs_epu16_rm_128
   2902   ;CHECK: vpsubusw (%rdi), %xmm0, %xmm0
   2903   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2904   %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
   2905   ret <8 x i16> %res
   2906 }
   2907 
   2908 define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
   2909   ;CHECK-LABEL: test_mask_subs_epu16_rmk_128
   2910   ;CHECK: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} 
   2911   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2912   %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
   2913   ret <8 x i16> %res
   2914 }
   2915 
   2916 define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
   2917   ;CHECK-LABEL: test_mask_subs_epu16_rmkz_128
   2918   ;CHECK: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} 
   2919   %b = load <8 x i16>, <8 x i16>* %ptr_b
   2920   %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
   2921   ret <8 x i16> %res
   2922 }
   2923 
   2924 declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   2925 
   2926 define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
   2927   ;CHECK-LABEL: test_mask_subs_epu16_rr_256
   2928   ;CHECK: vpsubusw %ymm1, %ymm0, %ymm0     
   2929   %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   2930   ret <16 x i16> %res
   2931 }
   2932 
   2933 define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
   2934   ;CHECK-LABEL: test_mask_subs_epu16_rrk_256
   2935   ;CHECK: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} 
   2936   %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   2937   ret <16 x i16> %res
   2938 }
   2939 
   2940 define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
   2941   ;CHECK-LABEL: test_mask_subs_epu16_rrkz_256
   2942   ;CHECK: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} 
   2943   %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   2944   ret <16 x i16> %res
   2945 }
   2946 
   2947 define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
   2948   ;CHECK-LABEL: test_mask_subs_epu16_rm_256
   2949   ;CHECK: vpsubusw (%rdi), %ymm0, %ymm0    
   2950   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2951   %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
   2952   ret <16 x i16> %res
   2953 }
   2954 
   2955 define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
   2956   ;CHECK-LABEL: test_mask_subs_epu16_rmk_256
   2957   ;CHECK: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} 
   2958   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2959   %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
   2960   ret <16 x i16> %res
   2961 }
   2962 
   2963 define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
   2964   ;CHECK-LABEL: test_mask_subs_epu16_rmkz_256
   2965   ;CHECK: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} 
   2966   %b = load <16 x i16>, <16 x i16>* %ptr_b
   2967   %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
   2968   ret <16 x i16> %res
   2969 }
   2970 
   2971 declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   2972 
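; Signed saturating add of i8 elements (vpaddsb).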
   2973 define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
   2974   ;CHECK-LABEL: test_mask_adds_epi8_rr_128
   2975   ;CHECK: vpaddsb %xmm1, %xmm0, %xmm0 
   2976   %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
   2977   ret <16 x i8> %res
   2978 }
   2979 
   2980 define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
   2981   ;CHECK-LABEL: test_mask_adds_epi8_rrk_128
   2982   ;CHECK: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} 
   2983   %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
   2984   ret <16 x i8> %res
   2985 }
   2986 
   2987 define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
   2988   ;CHECK-LABEL: test_mask_adds_epi8_rrkz_128
   2989   ;CHECK: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} 
   2990   %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
   2991   ret <16 x i8> %res
   2992 }
   2993 
   2994 define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
   2995   ;CHECK-LABEL: test_mask_adds_epi8_rm_128
   2996   ;CHECK: vpaddsb (%rdi), %xmm0, %xmm0 
   2997   %b = load <16 x i8>, <16 x i8>* %ptr_b
   2998   %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
   2999   ret <16 x i8> %res
   3000 }
   3001 
   3002 define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
   3003   ;CHECK-LABEL: test_mask_adds_epi8_rmk_128
   3004   ;CHECK: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} 
   3005   %b = load <16 x i8>, <16 x i8>* %ptr_b
   3006   %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
   3007   ret <16 x i8> %res
   3008 }
   3009 
   3010 define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
   3011   ;CHECK-LABEL: test_mask_adds_epi8_rmkz_128
   3012   ;CHECK: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} 
   3013   %b = load <16 x i8>, <16 x i8>* %ptr_b
   3014   %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
   3015   ret <16 x i8> %res
   3016 }
   3017 
   3018 declare <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
   3019 
   3020 define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
   3021   ;CHECK-LABEL: test_mask_adds_epi8_rr_256
   3022   ;CHECK: vpaddsb %ymm1, %ymm0, %ymm0 
   3023   %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
   3024   ret <32 x i8> %res
   3025 }
   3026 
   3027 define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
   3028   ;CHECK-LABEL: test_mask_adds_epi8_rrk_256
   3029   ;CHECK: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} 
   3030   %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
   3031   ret <32 x i8> %res
   3032 }
   3033 
   3034 define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
   3035   ;CHECK-LABEL: test_mask_adds_epi8_rrkz_256
   3036   ;CHECK: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} 
   3037   %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
   3038   ret <32 x i8> %res
   3039 }
   3040 
   3041 define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
   3042   ;CHECK-LABEL: test_mask_adds_epi8_rm_256
   3043   ;CHECK: vpaddsb (%rdi), %ymm0, %ymm0    
   3044   %b = load <32 x i8>, <32 x i8>* %ptr_b
   3045   %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
   3046   ret <32 x i8> %res
   3047 }
   3048 
   3049 define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
   3050   ;CHECK-LABEL: test_mask_adds_epi8_rmk_256
   3051   ;CHECK: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} 
   3052   %b = load <32 x i8>, <32 x i8>* %ptr_b
   3053   %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
   3054   ret <32 x i8> %res
   3055 }
   3056 
   3057 define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
   3058   ;CHECK-LABEL: test_mask_adds_epi8_rmkz_256
   3059   ;CHECK: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} 
   3060   %b = load <32 x i8>, <32 x i8>* %ptr_b
   3061   %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
   3062   ret <32 x i8> %res
   3063 }
   3064 
   3065 declare <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
   3066 
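; Signed saturating subtract of i8 elements (vpsubsb).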
   3067 define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
   3068   ;CHECK-LABEL: test_mask_subs_epi8_rr_128
   3069   ;CHECK: vpsubsb %xmm1, %xmm0, %xmm0     
   3070   %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
   3071   ret <16 x i8> %res
   3072 }
   3073 
   3074 define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
   3075   ;CHECK-LABEL: test_mask_subs_epi8_rrk_128
   3076   ;CHECK: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} 
   3077   %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
   3078   ret <16 x i8> %res
   3079 }
   3080 
   3081 define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
   3082   ;CHECK-LABEL: test_mask_subs_epi8_rrkz_128
   3083   ;CHECK: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} 
   3084   %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
   3085   ret <16 x i8> %res
   3086 }
   3087 
   3088 define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
   3089   ;CHECK-LABEL: test_mask_subs_epi8_rm_128
   3090   ;CHECK: vpsubsb (%rdi), %xmm0, %xmm0
   3091   %b = load <16 x i8>, <16 x i8>* %ptr_b
   3092   %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
   3093   ret <16 x i8> %res
   3094 }
   3095 
   3096 define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
   3097   ;CHECK-LABEL: test_mask_subs_epi8_rmk_128
   3098   ;CHECK: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} 
   3099   %b = load <16 x i8>, <16 x i8>* %ptr_b
   3100   %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
   3101   ret <16 x i8> %res
   3102 }
   3103 
   3104 define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
   3105   ;CHECK-LABEL: test_mask_subs_epi8_rmkz_128
   3106   ;CHECK: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} 
   3107   %b = load <16 x i8>, <16 x i8>* %ptr_b
   3108   %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
   3109   ret <16 x i8> %res
   3110 }
   3111 
   3112 declare <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
   3113 
   3114 define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
   3115   ;CHECK-LABEL: test_mask_subs_epi8_rr_256
   3116   ;CHECK: vpsubsb %ymm1, %ymm0, %ymm0     
   3117   %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
   3118   ret <32 x i8> %res
   3119 }
   3120 
   3121 define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
   3122   ;CHECK-LABEL: test_mask_subs_epi8_rrk_256
   3123   ;CHECK: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} 
   3124   %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
   3125   ret <32 x i8> %res
   3126 }
   3127 
   3128 define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
   3129   ;CHECK-LABEL: test_mask_subs_epi8_rrkz_256
   3130   ;CHECK: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} 
   3131   %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
   3132   ret <32 x i8> %res
   3133 }
   3134 
   3135 define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
   3136   ;CHECK-LABEL: test_mask_subs_epi8_rm_256
   3137   ;CHECK: vpsubsb (%rdi), %ymm0, %ymm0    
   3138   %b = load <32 x i8>, <32 x i8>* %ptr_b
   3139   %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
   3140   ret <32 x i8> %res
   3141 }
   3142 
   3143 define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
   3144   ;CHECK-LABEL: test_mask_subs_epi8_rmk_256
   3145   ;CHECK: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} 
   3146   %b = load <32 x i8>, <32 x i8>* %ptr_b
   3147   %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
   3148   ret <32 x i8> %res
   3149 }
   3150 
   3151 define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
   3152   ;CHECK-LABEL: test_mask_subs_epi8_rmkz_256
   3153   ;CHECK: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} 
   3154   %b = load <32 x i8>, <32 x i8>* %ptr_b
   3155   %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
   3156   ret <32 x i8> %res
   3157 }
   3158 
   3159 declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
   3160 
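; Unsigned saturating add of i8 elements (vpaddusb).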
   3161 define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
   3162   ;CHECK-LABEL: test_mask_adds_epu8_rr_128
   3163   ;CHECK: vpaddusb %xmm1, %xmm0, %xmm0 
   3164   %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
   3165   ret <16 x i8> %res
   3166 }
   3167 
   3168 define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
   3169   ;CHECK-LABEL: test_mask_adds_epu8_rrk_128
   3170   ;CHECK: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} 
   3171   %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
   3172   ret <16 x i8> %res
   3173 }
   3174 
   3175 define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
   3176   ;CHECK-LABEL: test_mask_adds_epu8_rrkz_128
   3177   ;CHECK: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} 
   3178   %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
   3179   ret <16 x i8> %res
   3180 }
   3181 
   3182 define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
   3183   ;CHECK-LABEL: test_mask_adds_epu8_rm_128
   3184   ;CHECK: vpaddusb (%rdi), %xmm0, %xmm0 
   3185   %b = load <16 x i8>, <16 x i8>* %ptr_b
   3186   %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
   3187   ret <16 x i8> %res
   3188 }
   3189 
   3190 define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
   3191   ;CHECK-LABEL: test_mask_adds_epu8_rmk_128
   3192   ;CHECK: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} 
   3193   %b = load <16 x i8>, <16 x i8>* %ptr_b
   3194   %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
   3195   ret <16 x i8> %res
   3196 }
   3197 
   3198 define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
   3199   ;CHECK-LABEL: test_mask_adds_epu8_rmkz_128
   3200   ;CHECK: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} 
   3201   %b = load <16 x i8>, <16 x i8>* %ptr_b
   3202   %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
   3203   ret <16 x i8> %res
   3204 }
   3205 
   3206 declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
   3207 
   3208 define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
   3209   ;CHECK-LABEL: test_mask_adds_epu8_rr_256
   3210   ;CHECK: vpaddusb %ymm1, %ymm0, %ymm0 
   3211   %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
   3212   ret <32 x i8> %res
   3213 }
   3214 
   3215 define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
   3216   ;CHECK-LABEL: test_mask_adds_epu8_rrk_256
   3217   ;CHECK: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} 
   3218   %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
   3219   ret <32 x i8> %res
   3220 }
   3221 
   3222 define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
   3223   ;CHECK-LABEL: test_mask_adds_epu8_rrkz_256
   3224   ;CHECK: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} 
   3225   %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
   3226   ret <32 x i8> %res
   3227 }
   3228 
   3229 define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
   3230   ;CHECK-LABEL: test_mask_adds_epu8_rm_256
   3231   ;CHECK: vpaddusb (%rdi), %ymm0, %ymm0    
   3232   %b = load <32 x i8>, <32 x i8>* %ptr_b
   3233   %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
   3234   ret <32 x i8> %res
   3235 }
   3236 
   3237 define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
   3238   ;CHECK-LABEL: test_mask_adds_epu8_rmk_256
   3239   ;CHECK: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} 
   3240   %b = load <32 x i8>, <32 x i8>* %ptr_b
   3241   %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
   3242   ret <32 x i8> %res
   3243 }
   3244 
   3245 define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
   3246   ;CHECK-LABEL: test_mask_adds_epu8_rmkz_256
   3247   ;CHECK: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} 
   3248   %b = load <32 x i8>, <32 x i8>* %ptr_b
   3249   %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
   3250   ret <32 x i8> %res
   3251 }
   3252 
   3253 declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
   3254 
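; Unsigned saturating subtract of i8 elements (vpsubusb).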
   3255 define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
   3256   ;CHECK-LABEL: test_mask_subs_epu8_rr_128
   3257   ;CHECK: vpsubusb %xmm1, %xmm0, %xmm0     
   3258   %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
   3259   ret <16 x i8> %res
   3260 }
   3261 
   3262 define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
   3263   ;CHECK-LABEL: test_mask_subs_epu8_rrk_128
   3264   ;CHECK: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} 
   3265   %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
   3266   ret <16 x i8> %res
   3267 }
   3268 
   3269 define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
   3270   ;CHECK-LABEL: test_mask_subs_epu8_rrkz_128
   3271   ;CHECK: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} 
   3272   %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
   3273   ret <16 x i8> %res
   3274 }
   3275 
   3276 define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
   3277   ;CHECK-LABEL: test_mask_subs_epu8_rm_128
   3278   ;CHECK: vpsubusb (%rdi), %xmm0, %xmm0
   3279   %b = load <16 x i8>, <16 x i8>* %ptr_b
   3280   %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
   3281   ret <16 x i8> %res
   3282 }
   3283 
   3284 define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
   3285   ;CHECK-LABEL: test_mask_subs_epu8_rmk_128
   3286   ;CHECK: vpsubusb (%rdi), %xmm0, %xmm1 {%k1} 
   3287   %b = load <16 x i8>, <16 x i8>* %ptr_b
   3288   %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
   3289   ret <16 x i8> %res
   3290 }
   3291 
   3292 define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
   3293   ;CHECK-LABEL: test_mask_subs_epu8_rmkz_128
   3294   ;CHECK: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} 
   3295   %b = load <16 x i8>, <16 x i8>* %ptr_b
   3296   %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
   3297   ret <16 x i8> %res
   3298 }
   3299 
   3300 declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
   3301 
   3302 define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
   3303   ;CHECK-LABEL: test_mask_subs_epu8_rr_256
   3304   ;CHECK: vpsubusb %ymm1, %ymm0, %ymm0     
   3305   %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
   3306   ret <32 x i8> %res
   3307 }
   3308 
   3309 define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
   3310   ;CHECK-LABEL: test_mask_subs_epu8_rrk_256
   3311   ;CHECK: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} 
   3312   %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
   3313   ret <32 x i8> %res
   3314 }
   3315 
   3316 define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
   3317   ;CHECK-LABEL: test_mask_subs_epu8_rrkz_256
   3318   ;CHECK: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} 
   3319   %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
   3320   ret <32 x i8> %res
   3321 }
   3322 
   3323 define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
   3324   ;CHECK-LABEL: test_mask_subs_epu8_rm_256
   3325   ;CHECK: vpsubusb (%rdi), %ymm0, %ymm0    
   3326   %b = load <32 x i8>, <32 x i8>* %ptr_b
   3327   %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
   3328   ret <32 x i8> %res
   3329 }
   3330 
   3331 define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
   3332   ;CHECK-LABEL: test_mask_subs_epu8_rmk_256
   3333   ;CHECK: vpsubusb (%rdi), %ymm0, %ymm1 {%k1} 
   3334   %b = load <32 x i8>, <32 x i8>* %ptr_b
   3335   %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
   3336   ret <32 x i8> %res
   3337 }
   3338 
   3339 define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
   3340   ;CHECK-LABEL: test_mask_subs_epu8_rmkz_256
   3341   ;CHECK: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} 
   3342   %b = load <32 x i8>, <32 x i8>* %ptr_b
   3343   %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
   3344   ret <32 x i8> %res
   3345 }
   3346 
   3347 declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
   3348 
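; Signed and unsigned min/max on i8/i16 elements (vpmaxsb, vpmaxub, vpminsb, vpminub and the word variants); each test checks the masked form and an unmasked or zero-passthru form.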
   3349 declare <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
   3350 
   3351 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_b_128
   3352 ; CHECK-NOT: call 
   3353 ; CHECK: vpmaxsb %xmm
   3354 ; CHECK: {%k1} 
   3355 define <16 x i8>@test_int_x86_avx512_mask_pmaxs_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
    3356   %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
   3357   %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
   3358   %res2 = add <16 x i8> %res, %res1
   3359   ret <16 x i8> %res2
   3360 }
   3361 
   3362 declare <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
   3363 
   3364 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_b_256
   3365 ; CHECK-NOT: call 
   3366 ; CHECK: vpmaxsb %ymm
   3367 ; CHECK: {%k1} 
   3368 define <32 x i8>@test_int_x86_avx512_mask_pmaxs_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
   3369   %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
   3370   %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
   3371   %res2 = add <32 x i8> %res, %res1
   3372   ret <32 x i8> %res2
   3373 }
   3374 
   3375 declare <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   3376 
   3377 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_w_128
   3378 ; CHECK-NOT: call 
   3379 ; CHECK: vpmaxsw %xmm
   3380 ; CHECK: {%k1} 
   3381 define <8 x i16>@test_int_x86_avx512_mask_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
   3382   %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   3383   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
   3384   %res2 = add <8 x i16> %res, %res1
   3385   ret <8 x i16> %res2
   3386 }
   3387 
   3388 declare <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   3389 
   3390 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_w_256
   3391 ; CHECK-NOT: call 
   3392 ; CHECK: vpmaxsw %ymm
   3393 ; CHECK: {%k1} 
   3394 define <16 x i16>@test_int_x86_avx512_mask_pmaxs_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
   3395   %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
   3396   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
   3397   %res2 = add <16 x i16> %res, %res1
   3398   ret <16 x i16> %res2
   3399 }
   3400 
   3401 declare <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
   3402 
   3403 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_b_128
   3404 ; CHECK-NOT: call 
   3405 ; CHECK: vpmaxub %xmm
   3406 ; CHECK: {%k1} 
    3407 define <16 x i8>@test_int_x86_avx512_mask_pmaxu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
   3408   %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
   3409   %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
   3410   %res2 = add <16 x i8> %res, %res1
   3411   ret <16 x i8> %res2
   3412 }
   3413 
   3414 declare <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
   3415 
   3416 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_b_256
   3417 ; CHECK-NOT: call 
   3418 ; CHECK: vpmaxub %ymm
   3419 ; CHECK: {%k1} 
   3420 define <32 x i8>@test_int_x86_avx512_mask_pmaxu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
   3421   %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
   3422   %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
   3423   %res2 = add <32 x i8> %res, %res1
   3424   ret <32 x i8> %res2
   3425 }
   3426 
   3427 declare <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   3428 
   3429 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_w_128
   3430 ; CHECK-NOT: call 
   3431 ; CHECK: vpmaxuw %xmm
   3432 ; CHECK: {%k1} 
   3433 define <8 x i16>@test_int_x86_avx512_mask_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
   3434   %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   3435   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
   3436   %res2 = add <8 x i16> %res, %res1
   3437   ret <8 x i16> %res2
   3438 }
   3439 
   3440 declare <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   3441 
   3442 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_w_256
   3443 ; CHECK-NOT: call 
   3444 ; CHECK: vpmaxuw %ymm
   3445 ; CHECK: {%k1} 
   3446 define <16 x i16>@test_int_x86_avx512_mask_pmaxu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
   3447   %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
   3448   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
   3449   %res2 = add <16 x i16> %res, %res1
   3450   ret <16 x i16> %res2
   3451 }
   3452 
   3453 declare <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
   3454 
   3455 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_b_128
   3456 ; CHECK-NOT: call 
   3457 ; CHECK: vpminsb %xmm
   3458 ; CHECK: {%k1} 
   3459 define <16 x i8>@test_int_x86_avx512_mask_pmins_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
   3460   %res = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
   3461   %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
   3462   %res2 = add <16 x i8> %res, %res1
   3463   ret <16 x i8> %res2
   3464 }
   3465 
   3466 declare <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
   3467 
   3468 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_b_256
   3469 ; CHECK-NOT: call 
   3470 ; CHECK: vpminsb %ymm
   3471 ; CHECK: {%k1} 
   3472 define <32 x i8>@test_int_x86_avx512_mask_pmins_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
   3473   %res = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
   3474   %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
   3475   %res2 = add <32 x i8> %res, %res1
   3476   ret <32 x i8> %res2
   3477 }
   3478 
   3479 declare <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   3480 
   3481 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_w_128
   3482 ; CHECK-NOT: call 
   3483 ; CHECK: vpminsw %xmm
   3484 ; CHECK: {%k1} 
   3485 define <8 x i16>@test_int_x86_avx512_mask_pmins_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
   3486   %res = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   3487   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
   3488   %res2 = add <8 x i16> %res, %res1
   3489   ret <8 x i16> %res2
   3490 }
   3491 
   3492 declare <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   3493 
   3494 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_w_256
   3495 ; CHECK-NOT: call 
   3496 ; CHECK: vpminsw %ymm
   3497 ; CHECK: {%k1} 
   3498 define <16 x i16>@test_int_x86_avx512_mask_pmins_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
   3499   %res = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
   3500   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
   3501   %res2 = add <16 x i16> %res, %res1
   3502   ret <16 x i16> %res2
   3503 }
   3504 
   3505 declare <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
   3506 
   3507 ; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_b_128
   3508 ; CHECK-NOT: call 
   3509 ; CHECK: vpminub %xmm
   3510 ; CHECK: {%k1} 
   3511 define <16 x i8>@test_int_x86_avx512_mask_pminu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
   3512   %res = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
   3513   %res1 = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
   3514   %res2 = add <16 x i8> %res, %res1
   3515   ret <16 x i8> %res2
   3516 }
   3517 
   3518 declare <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
   3519 
   3520 ; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_b_256
   3521 ; CHECK-NOT: call 
   3522 ; CHECK: vpminub %ymm
   3523 ; CHECK: {%k1} 
   3524 define <32 x i8>@test_int_x86_avx512_mask_pminu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
   3525   %res = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
   3526   %res1 = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
   3527   %res2 = add <32 x i8> %res, %res1
   3528   ret <32 x i8> %res2
   3529 }
   3530 
   3531 declare <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   3532 
   3533 ; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_w_128
   3534 ; CHECK-NOT: call 
   3535 ; CHECK: vpminuw %xmm
   3536 ; CHECK: {%k1} 
   3537 define <8 x i16>@test_int_x86_avx512_mask_pminu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
   3538   %res = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   3539   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
   3540   %res2 = add <8 x i16> %res, %res1
   3541   ret <8 x i16> %res2
   3542 }
   3543 
   3544 declare <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   3545 
   3546 ; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_w_256
   3547 ; CHECK-NOT: call 
   3548 ; CHECK: vpminuw %ymm
   3549 ; CHECK: {%k1} 
   3550 define <16 x i16>@test_int_x86_avx512_mask_pminu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
   3551   %res = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
   3552   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
   3553   %res2 = add <16 x i16> %res, %res1
   3554   ret <16 x i16> %res2
   3555 }
   3556 
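; Two-source word permutes (vpermt2w/vpermi2w) with merge and zero masking.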
   3557 declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   3558 
   3559 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_128
   3560 ; CHECK-NOT: call 
   3561 ; CHECK: kmov 
   3562 ; CHECK: vpermt2w %xmm{{.*}}{%k1} 
   3563 ; CHECK-NOT: {z}
   3564 define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
   3565   %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   3566   %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
   3567   %res2 = add <8 x i16> %res, %res1
   3568   ret <8 x i16> %res2
   3569 }
   3570 
   3571 declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   3572 
   3573 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_128
   3574 ; CHECK-NOT: call 
   3575 ; CHECK: kmov 
   3576 ; CHECK: vpermt2w %xmm{{.*}}{%k1} {z}
   3577 define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
   3578   %res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   3579   %res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
   3580   %res2 = add <8 x i16> %res, %res1
   3581   ret <8 x i16> %res2
   3582 }
   3583 
   3584 declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   3585 
   3586 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_256
   3587 ; CHECK-NOT: call 
   3588 ; CHECK: kmov 
   3589 ; CHECK: vpermt2w %ymm{{.*}}{%k1} 
   3590 define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
   3591   %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
   3592   %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
   3593   %res2 = add <16 x i16> %res, %res1
   3594   ret <16 x i16> %res2
   3595 }
   3596 
   3597 declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   3598 
   3599 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_256
   3600 ; CHECK-NOT: call 
   3601 ; CHECK: kmov 
   3602 ; CHECK: vpermt2w %ymm{{.*}}{%k1} {z}
   3603 define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
   3604   %res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
   3605   %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
   3606   %res2 = add <16 x i16> %res, %res1
   3607   ret <16 x i16> %res2
   3608 }
   3609 
   3610 declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   3611 
   3612 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_128
   3613 ; CHECK-NOT: call 
   3614 ; CHECK: kmov 
   3615 ; CHECK: vpermi2w %xmm{{.*}}{%k1} 
   3616 define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
   3617   %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   3618   %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
   3619   %res2 = add <8 x i16> %res, %res1
   3620   ret <8 x i16> %res2
   3621 }
   3622 
   3623 declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   3624 
   3625 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_256
   3626 ; CHECK-NOT: call 
   3627 ; CHECK: kmov 
   3628 ; CHECK: vpermi2w %ymm{{.*}}{%k1} 
   3629 define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
   3630   %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
   3631   %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
   3632   %res2 = add <16 x i16> %res, %res1
   3633   ret <16 x i16> %res2
   3634 }
   3635 
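; Byte and word rounding averages (vpavgb/vpavgw).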
   3636 declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
   3637 
   3638 ; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_128
   3639 ; CHECK-NOT: call 
   3640 ; CHECK: vpavgb %xmm
   3641 ; CHECK: {%k1} 
   3642 define <16 x i8>@test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
   3643   %res = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
   3644   %res1 = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
   3645   %res2 = add <16 x i8> %res, %res1
   3646   ret <16 x i8> %res2
   3647 }
   3648 
   3649 declare <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
   3650 
   3651 ; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_256
   3652 ; CHECK-NOT: call 
   3653 ; CHECK: vpavgb %ymm
   3654 ; CHECK: {%k1} 
   3655 define <32 x i8>@test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
   3656   %res = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
   3657   %res1 = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
   3658   %res2 = add <32 x i8> %res, %res1
   3659   ret <32 x i8> %res2
   3660 }
   3661 
   3662 declare <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   3663 
   3664 ; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_w_128
   3665 ; CHECK-NOT: call 
   3666 ; CHECK: vpavgw %xmm
   3667 ; CHECK: {%k1} 
   3668 define <8 x i16>@test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
   3669   %res = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   3670   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
   3671   %res2 = add <8 x i16> %res, %res1
   3672   ret <8 x i16> %res2
   3673 }
   3674 
   3675 declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   3676 
   3677 ; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_w_256
   3678 ; CHECK-NOT: call 
   3679 ; CHECK: vpavgw %ymm
   3680 ; CHECK: {%k1} 
   3681 define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
   3682   %res = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
   3683   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
   3684   %res2 = add <16 x i16> %res, %res1
   3685   ret <16 x i16> %res2
   3686 }
   3687 
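; Byte shuffle (vpshufb) with merge masking.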
   3688 declare <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
   3689 
   3690 ; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_128
   3691 ; CHECK-NOT: call 
   3692 ; CHECK: kmov 
   3693 ; CHECK: vpshufb %xmm{{.*}}{%k1} 
   3694 define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
   3695   %res = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
   3696   %res1 = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
   3697   %res2 = add <16 x i8> %res, %res1
   3698   ret <16 x i8> %res2
   3699 }
   3700 
   3701 declare <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
   3702 
   3703 ; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_256
   3704 ; CHECK-NOT: call 
   3705 ; CHECK: kmov 
   3706 ; CHECK: vpshufb %ymm{{.*}}{%k1} 
   3707 define <32 x i8>@test_int_x86_avx512_mask_pshuf_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
   3708   %res = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
   3709   %res1 = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
   3710   %res2 = add <32 x i8> %res, %res1
   3711   ret <32 x i8> %res2
   3712 }
   3713 
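; Absolute value of i8/i16 elements (vpabsb/vpabsw).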
   3714 declare <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8>, <16 x i8>, i16)
   3715 
   3716 ; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_128
   3717 ; CHECK-NOT: call 
   3718 ; CHECK: kmov 
   3719 ; CHECK: vpabsb{{.*}}{%k1} 
   3720 define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
   3721   %res = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
   3722   %res1 = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
   3723   %res2 = add <16 x i8> %res, %res1
   3724   ret <16 x i8> %res2
   3725 }
   3726 
   3727 declare <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8>, <32 x i8>, i32)
   3728 
   3729 ; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_256
   3730 ; CHECK-NOT: call 
   3731 ; CHECK: kmov 
   3732 ; CHECK: vpabsb{{.*}}{%k1} 
   3733 define <32 x i8>@test_int_x86_avx512_mask_pabs_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
   3734   %res = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
   3735   %res1 = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1)
   3736   %res2 = add <32 x i8> %res, %res1
   3737   ret <32 x i8> %res2
   3738 }
   3739 
   3740 declare <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16>, <8 x i16>, i8)
   3741 
   3742 ; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_128
   3743 ; CHECK-NOT: call 
   3744 ; CHECK: kmov 
   3745 ; CHECK: vpabsw{{.*}}{%k1} 
   3746 define <8 x i16>@test_int_x86_avx512_mask_pabs_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
   3747   %res = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
   3748   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
   3749   %res2 = add <8 x i16> %res, %res1
   3750   ret <8 x i16> %res2
   3751 }
   3752 
   3753 declare <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16>, <16 x i16>, i16)
   3754 
   3755 ; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_256
   3756 ; CHECK-NOT: call 
   3757 ; CHECK: kmov 
   3758 ; CHECK: vpabsw{{.*}}{%k1} 
   3759 define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
   3760   %res = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
   3761   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1)
   3762   %res2 = add <16 x i16> %res, %res1
   3763   ret <16 x i16> %res2
   3764 }
   3765 
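; Mask-driven blends (vpblendmb/vpblendmw): the k register selects between the two vector operands.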
   3766 ; CHECK-LABEL: test_x86_mask_blend_b_256
   3767 ; CHECK: vpblendmb
   3768 define <32 x i8> @test_x86_mask_blend_b_256(i32 %a0, <32 x i8> %a1, <32 x i8> %a2) {
   3769   %res = call <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8> %a1, <32 x i8> %a2, i32 %a0) ; <<32 x i8>> [#uses=1]
   3770   ret <32 x i8> %res
   3771 }
   3772 declare <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8>, <32 x i8>, i32) nounwind readonly
   3773 
   3774 ; CHECK-LABEL: test_x86_mask_blend_w_256
   3775 define <16 x i16> @test_x86_mask_blend_w_256(i16 %mask, <16 x i16> %a1, <16 x i16> %a2) {
   3776   ; CHECK: vpblendmw
   3777   %res = call <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16> %a1, <16 x i16> %a2, i16 %mask) ; <<16 x i16>> [#uses=1]
   3778   ret <16 x i16> %res
   3779 }
   3780 declare <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16>, <16 x i16>, i16) nounwind readonly
   3781 
   3782 ; CHECK-LABEL: test_x86_mask_blend_b_128
   3783 ; CHECK: vpblendmb
   3784 define <16 x i8> @test_x86_mask_blend_b_128(i16 %a0, <16 x i8> %a1, <16 x i8> %a2) {
   3785   %res = call <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8> %a1, <16 x i8> %a2, i16 %a0) ; <<16 x i8>> [#uses=1]
   3786   ret <16 x i8> %res
   3787 }
   3788 declare <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8>, <16 x i8>, i16) nounwind readonly
   3789 
   3790 ; CHECK-LABEL: test_x86_mask_blend_w_128
   3791 define <8 x i16> @test_x86_mask_blend_w_128(i8 %mask, <8 x i16> %a1, <8 x i16> %a2) {
   3792   ; CHECK: vpblendmw
   3793   %res = call <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16> %a1, <8 x i16> %a2, i8 %mask) ; <<8 x i16>> [#uses=1]
   3794   ret <8 x i16> %res
   3795 }
   3796 declare <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16>, <8 x i16>, i8) nounwind readonly
   3797 
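; High-half multiplies: vpmulhuw (unsigned), vpmulhw (signed), and vpmulhrsw (rounded, scaled); the encoding check verifies the EVEX prefix byte (0x62).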
   3798 declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   3799 
   3800 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_128
   3801 ; CHECK-NOT: call 
   3802 ; CHECK: kmov 
   3803 ; CHECK: {%k1} 
   3804 ; CHECK: vpmulhuw {{.*}}encoding: [0x62
   3805 define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
   3806   %res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   3807   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
   3808   %res2 = add <8 x i16> %res, %res1
   3809   ret <8 x i16> %res2
   3810 }
   3811 
   3812 declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   3813 
   3814 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_256
   3815 ; CHECK-NOT: call 
   3816 ; CHECK: kmov 
   3817 ; CHECK: {%k1} 
   3818 ; CHECK: vpmulhuw {{.*}}encoding: [0x62
   3819 define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
   3820   %res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
   3821   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
   3822   %res2 = add <16 x i16> %res, %res1
   3823   ret <16 x i16> %res2
   3824 }
   3825 
   3826 declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   3827 
   3828 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_128
   3829 ; CHECK-NOT: call 
   3830 ; CHECK: kmov 
   3831 ; CHECK: {%k1} 
   3832 ; CHECK: vpmulhw {{.*}}encoding: [0x62
   3833 define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
   3834   %res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   3835   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
   3836   %res2 = add <8 x i16> %res, %res1
   3837   ret <8 x i16> %res2
   3838 }
   3839 
   3840 declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   3841 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_256
   3842 ; CHECK-NOT: call 
   3843 ; CHECK: kmov 
   3844 ; CHECK: {%k1} 
   3845 ; CHECK: vpmulhw {{.*}}encoding: [0x62
   3846 define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
   3847   %res = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
   3848   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
   3849   %res2 = add <16 x i16> %res, %res1
   3850   ret <16 x i16> %res2
   3851 }
   3852 
   3853 declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   3854 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_128
   3855 ; CHECK-NOT: call 
   3856 ; CHECK: kmov 
   3857 ; CHECK: {%k1} 
   3858 ; CHECK: vpmulhrsw {{.*}}encoding: [0x62
   3859 define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
   3860   %res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   3861   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
   3862   %res2 = add <8 x i16> %res, %res1
   3863   ret <8 x i16> %res2
   3864 }
   3865 
   3866 declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   3867 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_256
   3868 ; CHECK-NOT: call 
   3869 ; CHECK: kmov 
   3870 ; CHECK: {%k1} 
   3871 ; CHECK: vpmulhrsw {{.*}}encoding: [0x62
   3872 define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
   3873   %res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
   3874   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
   3875   %res2 = add <16 x i16> %res, %res1
   3876   ret <16 x i16> %res2
   3877 }
   3878 
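; Word-to-byte down-conversions: vpmovwb (truncate), vpmovswb (signed saturate), vpmovuswb (unsigned saturate), including the store-to-memory forms.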
   3879 declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8)
   3880 
   3881 define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
   3882 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
   3883 ; CHECK:       vpmovwb %xmm0, %xmm1 {%k1}
   3884 ; CHECK-NEXT:  vpmovwb %xmm0, %xmm2 {%k1} {z}
   3885 ; CHECK-NEXT:  vpmovwb %xmm0, %xmm0
   3886     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
   3887     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
   3888     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
   3889     %res3 = add <16 x i8> %res0, %res1
   3890     %res4 = add <16 x i8> %res3, %res2
   3891     ret <16 x i8> %res4
   3892 }
   3893 
   3894 declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8)
   3895 
   3896 define void @test_int_x86_avx512_mask_pmov_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
   3897 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128:
   3898 ; CHECK:  vpmovwb %xmm0, (%rdi)
   3899 ; CHECK:  vpmovwb %xmm0, (%rdi) {%k1}
   3900     call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
   3901     call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
   3902     ret void
   3903 }
   3904 
   3905 declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8)
   3906 
   3907 define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
   3908 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
   3909 ; CHECK:       vpmovswb %xmm0, %xmm1 {%k1}
   3910 ; CHECK-NEXT:  vpmovswb %xmm0, %xmm2 {%k1} {z}
   3911 ; CHECK-NEXT:  vpmovswb %xmm0, %xmm0
   3912     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
   3913     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
   3914     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
   3915     %res3 = add <16 x i8> %res0, %res1
   3916     %res4 = add <16 x i8> %res3, %res2
   3917     ret <16 x i8> %res4
   3918 }
   3919 
   3920 declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8)
   3921 
   3922 define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
   3923 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128:
   3924 ; CHECK:  vpmovswb %xmm0, (%rdi)
   3925 ; CHECK:  vpmovswb %xmm0, (%rdi) {%k1}
   3926     call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
   3927     call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
   3928     ret void
   3929 }
   3930 
   3931 declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8)
   3932 
   3933 define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
   3934 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
   3935 ; CHECK:       vpmovuswb %xmm0, %xmm1 {%k1}
   3936 ; CHECK-NEXT:  vpmovuswb %xmm0, %xmm2 {%k1} {z}
   3937 ; CHECK-NEXT:  vpmovuswb %xmm0, %xmm0
   3938     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
   3939     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
   3940     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
   3941     %res3 = add <16 x i8> %res0, %res1
   3942     %res4 = add <16 x i8> %res3, %res2
   3943     ret <16 x i8> %res4
   3944 }
   3945 
   3946 declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8)
   3947 
   3948 define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
   3949 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128:
   3950 ; CHECK:  vpmovuswb %xmm0, (%rdi)
   3951 ; CHECK:  vpmovuswb %xmm0, (%rdi) {%k1}
   3952     call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
   3953     call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
   3954     ret void
   3955 }
   3956 
   3957 declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
   3958 
   3959 define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
   3960 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
   3961 ; CHECK:       vpmovwb %ymm0, %xmm1 {%k1}
   3962 ; CHECK-NEXT:  vpmovwb %ymm0, %xmm2 {%k1} {z}
   3963 ; CHECK-NEXT:  vpmovwb %ymm0, %xmm0
   3964     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
   3965     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
   3966     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
   3967     %res3 = add <16 x i8> %res0, %res1
   3968     %res4 = add <16 x i8> %res3, %res2
   3969     ret <16 x i8> %res4
   3970 }
   3971 
   3972 declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16)
   3973 
   3974 define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
   3975 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256:
   3976 ; CHECK:  vpmovwb %ymm0, (%rdi)
   3977 ; CHECK:  vpmovwb %ymm0, (%rdi) {%k1}
   3978     call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
   3979     call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
   3980     ret void
   3981 }
   3982 
   3983 declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
   3984 
   3985 define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
   3986 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_256:
   3987 ; CHECK:       vpmovswb %ymm0, %xmm1 {%k1}
   3988 ; CHECK-NEXT:  vpmovswb %ymm0, %xmm2 {%k1} {z}
   3989 ; CHECK-NEXT:  vpmovswb %ymm0, %xmm0
   3990     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
   3991     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
   3992     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
   3993     %res3 = add <16 x i8> %res0, %res1
   3994     %res4 = add <16 x i8> %res3, %res2
   3995     ret <16 x i8> %res4
   3996 }
   3997 
   3998 declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16)
   3999 
   4000 define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
   4001 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256:
   4002 ; CHECK:  vpmovswb %ymm0, (%rdi)
   4003 ; CHECK:  vpmovswb %ymm0, (%rdi) {%k1}
   4004     call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
   4005     call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
   4006     ret void
   4007 }
   4008 
   4009 declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16)
   4010 
   4011 define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
   4012 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_256:
   4013 ; CHECK:       vpmovuswb %ymm0, %xmm1 {%k1}
   4014 ; CHECK-NEXT:  vpmovuswb %ymm0, %xmm2 {%k1} {z}
   4015 ; CHECK-NEXT:  vpmovuswb %ymm0, %xmm0
   4016     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
   4017     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
   4018     %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
   4019     %res3 = add <16 x i8> %res0, %res1
   4020     %res4 = add <16 x i8> %res3, %res2
   4021     ret <16 x i8> %res4
   4022 }
   4023 
   4024 declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16)
   4025 
   4026 define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
   4027 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256:
   4028 ; CHECK:  vpmovuswb %ymm0, (%rdi)
   4029 ; CHECK:  vpmovuswb %ymm0, (%rdi) {%k1}
   4030     call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
   4031     call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
   4032     ret void
   4033 }
   4034 
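; VPMADDWD tests: multiply adjacent pairs of signed words and add the two 32-bit
; products, i.e. res[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1].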
   4035 declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8)
   4036 
   4037 define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) {
   4038 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128:
   4039 ; CHECK:       ## BB#0:
   4040 ; CHECK-NEXT:    movzbl %dil, %eax
   4041 ; CHECK-NEXT:    kmovw %eax, %k1
   4042 ; CHECK-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm2 {%k1}
   4043 ; CHECK-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
   4044 ; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
   4045 ; CHECK-NEXT:    retq
   4046   %res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3)
   4047   %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1)
   4048   %res2 = add <4 x i32> %res, %res1
   4049   ret <4 x i32> %res2
   4050 }
   4051 
   4052 declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8)
   4053 
   4054 define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) {
   4055 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256:
   4056 ; CHECK:       ## BB#0:
   4057 ; CHECK-NEXT:    movzbl %dil, %eax
   4058 ; CHECK-NEXT:    kmovw %eax, %k1
   4059 ; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm2 {%k1}
   4060 ; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
   4061 ; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
   4062 ; CHECK-NEXT:    retq
   4063   %res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3)
   4064   %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1)
   4065   %res2 = add <8 x i32> %res, %res1
   4066   ret <8 x i32> %res2
   4067 }
   4068 
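; VPMADDUBSW tests: multiply unsigned bytes of the first source by the
; corresponding signed bytes of the second, then add adjacent byte products with
; signed saturation to form 16-bit results.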
   4069 declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8)
   4070 
   4071 define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) {
   4072 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128:
   4073 ; CHECK:       ## BB#0:
   4074 ; CHECK-NEXT:    movzbl %dil, %eax
   4075 ; CHECK-NEXT:    kmovw %eax, %k1
   4076 ; CHECK-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1}
   4077 ; CHECK-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0
   4078 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
   4079 ; CHECK-NEXT:    retq
   4080   %res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3)
   4081   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1)
   4082   %res2 = add <8 x i16> %res, %res1
   4083   ret <8 x i16> %res2
   4084 }
   4085 
   4086 declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16)
   4087 
   4088 define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) {
   4089 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256:
   4090 ; CHECK:       ## BB#0:
   4091 ; CHECK-NEXT:    kmovw %edi, %k1
   4092 ; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1}
   4093 ; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
   4094 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
   4095 ; CHECK-NEXT:    retq
   4096   %res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3)
   4097   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1)
   4098   %res2 = add <16 x i16> %res, %res1
   4099   ret <16 x i16> %res2
   4100 }
   4101 
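; VPUNPCK{H,L}BW and VPUNPCK{H,L}WD tests: interleave the high or low bytes/words
; of the two sources; for the 256-bit forms the interleave happens within each
; 128-bit lane, as spelled out in the shuffle comments checked below.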
   4102 declare <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
   4103 
   4104 define <16 x i8>@test_int_x86_avx512_mask_punpckhb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
   4105 ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_128:
   4106 ; CHECK:         vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1}
   4107 ; CHECK-NEXT:    ## xmm2 = xmm2[8],k1[8],xmm2[9],k1[9],xmm2[10],k1[10],xmm2[11],k1[11],xmm2[12],k1[12],xmm2[13],k1[13],xmm2[14],k1[14],xmm2[15],k1[15]
   4108 ; CHECK-NEXT:    vpunpckhbw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x68,0xc1]
   4109 ; CHECK-NEXT:    ## xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
   4110   %res = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
   4111   %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
   4112   %res2 = add <16 x i8> %res, %res1
   4113   ret <16 x i8> %res2
   4114 }
   4115 
   4116 declare <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
   4117 
   4118 define <16 x i8>@test_int_x86_avx512_mask_punpcklb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
   4119 ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_128:
   4120 ; CHECK:         vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1}
   4121 ; CHECK-NEXT:    ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1],xmm2[2],k1[2],xmm2[3],k1[3],xmm2[4],k1[4],xmm2[5],k1[5],xmm2[6],k1[6],xmm2[7],k1[7]
   4122 ; CHECK-NEXT:    vpunpcklbw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x60,0xc1]
   4123 ; CHECK-NEXT:    ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   4124   %res = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
   4125   %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
   4126   %res2 = add <16 x i8> %res, %res1
   4127   ret <16 x i8> %res2
   4128 }
   4129 
   4130 declare <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
   4131 
   4132 define <32 x i8>@test_int_x86_avx512_mask_punpckhb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
   4133 ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_256:
   4134 ; CHECK:         vpunpckhbw %ymm1, %ymm0, %ymm2 {%k1}
   4135 ; CHECK-NEXT:    ## ymm2 = ymm2[8],k1[8],ymm2[9],k1[9],ymm2[10],k1[10],ymm2[11],k1[11],ymm2[12],k1[12],ymm2[13],k1[13],ymm2[14],k1[14],ymm2[15],k1[15],ymm2[24],k1[24],ymm2[25],k1[25],ymm2[26],k1[26],ymm2[27],k1[27],ymm2[28],k1[28],ymm2[29],k1[29],ymm2[30],k1[30],ymm2[31],k1[31]
   4136 ; CHECK-NEXT:    vpunpckhbw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x68,0xc1]
   4137 ; CHECK-NEXT:    ## ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
   4138   %res = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
   4139   %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
   4140   %res2 = add <32 x i8> %res, %res1
   4141   ret <32 x i8> %res2
   4142 }
   4143 
   4144 declare <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
   4145 
   4146 define <32 x i8>@test_int_x86_avx512_mask_punpcklb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
   4147 ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_256:
   4148 ; CHECK:         vpunpcklbw %ymm1, %ymm0, %ymm2 {%k1}
   4149 ; CHECK-NEXT:    ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[2],k1[2],ymm2[3],k1[3],ymm2[4],k1[4],ymm2[5],k1[5],ymm2[6],k1[6],ymm2[7],k1[7],ymm2[16],k1[16],ymm2[17],k1[17],ymm2[18],k1[18],ymm2[19],k1[19],ymm2[20],k1[20],ymm2[21],k1[21],ymm2[22],k1[22],ymm2[23],k1[23]
   4150 ; CHECK-NEXT:    vpunpcklbw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x60,0xc1]
   4151 ; CHECK-NEXT:    ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
   4152   %res = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
   4153   %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
   4154   %res2 = add <32 x i8> %res, %res1
   4155   ret <32 x i8> %res2
   4156 }
   4157 
   4158 declare <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   4159 
   4160 define <8 x i16>@test_int_x86_avx512_mask_punpcklw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
   4161 ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_128:
   4162 ; CHECK:         vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1}
   4163 ; CHECK-NEXT:    ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1],xmm2[2],k1[2],xmm2[3],k1[3]
   4164 ; CHECK-NEXT:    vpunpcklwd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x61,0xc1]
   4165 ; CHECK-NEXT:    ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   4166   %res = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   4167   %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
   4168   %res2 = add <8 x i16> %res, %res1
   4169   ret <8 x i16> %res2
   4170 }
   4171 
   4172 declare <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
   4173 
   4174 define <8 x i16>@test_int_x86_avx512_mask_punpckhw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
   4175 ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_128:
   4176 ; CHECK:         vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1}
   4177 ; CHECK-NEXT:    ## xmm2 = xmm2[4],k1[4],xmm2[5],k1[5],xmm2[6],k1[6],xmm2[7],k1[7]
   4178 ; CHECK-NEXT:    vpunpckhwd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x69,0xc1]
   4179 ; CHECK-NEXT:    ## xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   4180   %res = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   4181   %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
   4182   %res2 = add <8 x i16> %res, %res1
   4183   ret <8 x i16> %res2
   4184 }
   4185 
   4186 declare <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   4187 
   4188 define <16 x i16>@test_int_x86_avx512_mask_punpcklw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
   4189 ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_256:
   4190 ; CHECK:         vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1}
   4191 ; CHECK-NEXT:    ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[2],k1[2],ymm2[3],k1[3],ymm2[8],k1[8],ymm2[9],k1[9],ymm2[10],k1[10],ymm2[11],k1[11]
   4192 ; CHECK-NEXT:    vpunpcklwd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x61,0xc1]
   4193 ; CHECK-NEXT:    ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
   4194   %res = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
   4195   %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
   4196   %res2 = add <16 x i16> %res, %res1
   4197   ret <16 x i16> %res2
   4198 }
   4199 
   4200 declare <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
   4201 
   4202 define <16 x i16>@test_int_x86_avx512_mask_punpckhw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
   4203 ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_256:
   4204 ; CHECK:         vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1}
   4205 ; CHECK-NEXT:    ## ymm2 = ymm2[4],k1[4],ymm2[5],k1[5],ymm2[6],k1[6],ymm2[7],k1[7],ymm2[12],k1[12],ymm2[13],k1[13],ymm2[14],k1[14],ymm2[15],k1[15]
   4206 ; CHECK-NEXT:    vpunpckhwd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x69,0xc1]
   4207 ; CHECK-NEXT:    ## ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
   4208   %res = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
   4209   %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
   4210   %res2 = add <16 x i16> %res, %res1
   4211   ret <16 x i16> %res2
   4212 }
   4213 
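; VPALIGNR tests: concatenate the corresponding 128-bit lanes of the two sources,
; shift the pair right by the immediate number of bytes (2 here), and take the low
; 16 bytes of each lane as the result.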
   4214 declare <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8>, <16 x i8>, i32, <16 x i8>, i16)
   4215 
   4216 define <16 x i8>@test_int_x86_avx512_mask_palignr_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x3, i16 %x4) {
   4217 ; CHECK-LABEL: test_int_x86_avx512_mask_palignr_128:
   4218 ; CHECK:       ## BB#0:
   4219 ; CHECK-NEXT:    kmovw %edi, %k1
   4220 ; CHECK-NEXT:    vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1}
   4221 ; CHECK-NEXT:    vpalignr $2, %xmm1, %xmm0, %xmm3 {%k1} {z}
   4222 ; CHECK-NEXT:    vpalignr $2, %xmm1, %xmm0, %xmm0
   4223 ; CHECK-NEXT:    vpaddb %xmm3, %xmm2, %xmm1
   4224 ; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
   4225 ; CHECK-NEXT:    retq
   4226   %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4)
   4227   %res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4)
   4228   %res2 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 -1)
   4229   %res3 = add <16 x i8> %res, %res1
   4230   %res4 = add <16 x i8> %res3, %res2
   4231   ret <16 x i8> %res4
   4232 }
   4233 
   4234 declare <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8>, <32 x i8>, i32, <32 x i8>, i32)
   4235 
   4236 define <32 x i8>@test_int_x86_avx512_mask_palignr_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x3, i32 %x4) {
   4237 ; CHECK-LABEL: test_int_x86_avx512_mask_palignr_256:
   4238 ; CHECK:       ## BB#0:
   4239 ; CHECK-NEXT:    kmovd %edi, %k1
   4240 ; CHECK-NEXT:    vpalignr $2, %ymm1, %ymm0, %ymm2 {%k1}
   4241 ; CHECK-NEXT:    vpalignr $2, %ymm1, %ymm0, %ymm3 {%k1} {z}
   4242 ; CHECK-NEXT:    vpalignr $2, %ymm1, %ymm0, %ymm0
   4243 ; CHECK-NEXT:    vpaddb %ymm3, %ymm2, %ymm1
   4244 ; CHECK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
   4245 ; CHECK-NEXT:    retq
   4246   %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4)
   4247   %res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4)
   4248   %res2 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 -1)
   4249   %res3 = add <32 x i8> %res, %res1
   4250   %res4 = add <32 x i8> %res3, %res2
   4251   ret <32 x i8> %res4
   4252 }
   4253 
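; VDBPSADBW tests: compute sums of absolute differences of unsigned bytes over
; 32-bit blocks selected by the immediate, producing packed word results.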
   4254 declare <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8>, <16 x i8>, i32, <8 x i16>, i8)
   4255 
   4256 define <8 x i16>@test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
   4257 ; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
   4258 ; CHECK:       ## BB#0:
   4259 ; CHECK-NEXT:    movzbl %dil, %eax
   4260 ; CHECK-NEXT:    kmovw %eax, %k1
   4261 ; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1}
   4262 ; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm3 {%k1} {z}
   4263 ; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm0
   4264 ; CHECK-NEXT:    vpaddw %xmm3, %xmm2, %xmm1
   4265 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
   4266 ; CHECK-NEXT:    retq
   4267   %res = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 %x4)
   4268   %res1 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> zeroinitializer, i8 %x4)
   4269   %res2 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 -1)
   4270   %res3 = add <8 x i16> %res, %res1
   4271   %res4 = add <8 x i16> %res2, %res3
   4272   ret <8 x i16> %res4
   4273 }
   4274 
   4275 declare <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8>, <32 x i8>, i32, <16 x i16>, i16)
   4276 
   4277 define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) {
   4278 ; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
   4279 ; CHECK:       ## BB#0:
   4280 ; CHECK-NEXT:    kmovw %edi, %k1
   4281 ; CHECK-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1}
   4282 ; CHECK-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm3 {%k1} {z}
   4283 ; CHECK-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm0
   4284 ; CHECK-NEXT:    vpaddw %ymm3, %ymm2, %ymm1
   4285 ; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
   4286 ; CHECK-NEXT:    retq
   4287   %res = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 %x4)
   4288   %res1 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> zeroinitializer, i16 %x4)
   4289   %res2 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 -1)
   4290   %res3 = add <16 x i16> %res, %res1
   4291   %res4 = add <16 x i16> %res3, %res2
   4292   ret <16 x i16> %res4
   4293 }
   4294 
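; VPBROADCASTB/VPBROADCASTW tests: splat the lowest byte or word of the XMM source
; into every element of the destination, again checking unmasked, merge-masked and
; zero-masked forms.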
   4295 declare <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8>, <32 x i8>, i32)
   4296 
   4297 define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) {
   4298 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_256:
   4299 ; CHECK:       ## BB#0:
   4300 ; CHECK-NEXT:    kmovd %edi, %k1
   4301 ; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm1 {%k1}
   4302 ; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm2 {%k1} {z}
   4303 ; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
   4304 ; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   4305 ; CHECK-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
   4306 ; CHECK-NEXT:    retq
   4307   %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
   4308   %res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask)
   4309   %res2 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> zeroinitializer, i32 %mask)
   4310   %res3 = add <32 x i8> %res, %res1
   4311   %res4 = add <32 x i8> %res2, %res3
   4312   ret <32 x i8> %res4
   4313 }
   4314 
   4315 declare <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8>, <16 x i8>, i16)
   4316 
   4317 define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) {
   4318 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_128:
   4319 ; CHECK:       ## BB#0:
   4320 ; CHECK-NEXT:    kmovw %edi, %k1
   4321 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm1 {%k1}
   4322 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm2 {%k1} {z}
   4323 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
   4324 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
   4325 ; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
   4326 ; CHECK-NEXT:    retq
   4327   %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
   4328   %res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask)
   4329   %res2 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> zeroinitializer, i16 %mask)
   4330   %res3 = add <16 x i8> %res, %res1
   4331   %res4 = add <16 x i8> %res2, %res3
   4332   ret <16 x i8> %res4
   4333 }
   4334 
   4335 declare <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16>, <16 x i16>, i16)
   4336 
   4337 define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) {
   4338 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_256:
   4339 ; CHECK:       ## BB#0:
   4340 ; CHECK-NEXT:    kmovw %edi, %k1
   4341 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm1 {%k1}
   4342 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm2 {%k1} {z}
   4343 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
   4344 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
   4345 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
   4346 ; CHECK-NEXT:    retq
   4347   %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
   4348   %res1 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask)
   4349   %res2 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> zeroinitializer, i16 %mask)
   4350   %res3 = add <16 x i16> %res, %res1
   4351   %res4 = add <16 x i16> %res2, %res3
   4352   ret <16 x i16> %res4
   4353 }
   4354 
   4355 declare <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16>, <8 x i16>, i8)
   4356 
   4357 define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) {
   4358 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128:
   4359 ; CHECK:       ## BB#0:
   4360 ; CHECK-NEXT:    movzbl %dil, %eax
   4361 ; CHECK-NEXT:    kmovw %eax, %k1
   4362 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm1 {%k1}
   4363 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm2 {%k1} {z}
   4364 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
   4365 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
   4366 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
   4367 ; CHECK-NEXT:    retq
   4368   %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
   4369   %res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask)
   4370   %res2 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> zeroinitializer, i8 %mask)
   4371   %res3 = add <8 x i16> %res, %res1
   4372   %res4 = add <8 x i16> %res2, %res3
   4373   ret <8 x i16> %res4
   4374 }
   4375 
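; The 512-bit broadcast tests additionally check the full instruction encodings and
; the KMOVQ/KMOVD moves used to load the 64-bit and 32-bit masks into %k1.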
   4376 declare <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8>, <64 x i8>, i64)
   4377 
   4378 define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) {
   4379 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_512:
   4380 ; CHECK:       ## BB#0:
   4381 ; CHECK-NEXT:    kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
   4382 ; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
   4383 ; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xd0]
   4384 ; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xc0]
   4385 ; CHECK-NEXT:    vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
   4386 ; CHECK-NEXT:    vpaddb %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0]
   4387 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   4388   %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
   4389   %res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask)
   4390   %res2 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> zeroinitializer, i64 %mask)
   4391   %res3 = add <64 x i8> %res, %res1
   4392   %res4 = add <64 x i8> %res2, %res3
   4393   ret <64 x i8> %res4
   4394 }
   4395 
   4396 declare <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16>, <32 x i16>, i32)
   4397 
   4398 define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) {
   4399 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_512:
   4400 ; CHECK:       ## BB#0:
   4401 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
   4402 ; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
   4403 ; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xd0]
   4404 ; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xc0]
   4405 ; CHECK-NEXT:    vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
   4406 ; CHECK-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
   4407 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   4408   %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
   4409   %res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask)
   4410   %res2 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> zeroinitializer, i32 %mask)
   4411   %res3 = add <32 x i16> %res, %res1
   4412   %res4 = add <32 x i16> %res2, %res3
   4413   ret <32 x i16> %res4
   4414 }
   4415 
   4416