; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s

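; This file checks that the AVX-512BW mask compare and blend intrinsics lower
; to the expected mask-register instructions (vpcmp*/vpblendm*) on an skx
; target, for both the unmasked (all-ones mask) and masked forms.
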
define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
; CHECK-LABEL: test_pcmpeq_b
; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ##
  %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
  ret i64 %res
}

define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_b
; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ##
  %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
  ret i64 %res
}

declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64)

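; For reference, a minimal generic-IR sketch (a hypothetical helper, not part
; of the checked tests) of what the pcmpeq.b intrinsic computes: an
; element-wise icmp yields a <64 x i1> predicate that bitcasts to the i64
; result, and the masked form ANDs that with the incoming mask. The pcmpgt
; variants below are analogous with a signed greater-than compare.
define i64 @pcmpeq_b_sketch(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
  %cmp = icmp eq <64 x i8> %a, %b        ; per-byte equality, <64 x i1>
  %bits = bitcast <64 x i1> %cmp to i64  ; pack the 64 predicate bits
  %res = and i64 %bits, %mask            ; apply the write mask
  ret i64 %res
}
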
define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
; CHECK-LABEL: test_pcmpeq_w
; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
  ret i32 %res
}

define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_w
; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
  ret i32 %res
}

declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32)

define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
; CHECK-LABEL: test_pcmpgt_b
; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 ##
  %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
  ret i64 %res
}

define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_b
; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} ##
  %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
  ret i64 %res
}

declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64)

define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
; CHECK-LABEL: test_pcmpgt_w
; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
  ret i32 %res
}

define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_w
; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
  ret i32 %res
}

declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)

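; The cmp/ucmp intrinsics below take an i8 predicate operand; as the CHECK
; mnemonics show, immediates 0-7 select eq, lt, le, unord, neq, nlt, nle and
; ord respectively, with the unsigned ucmp variants appending a "u". Each
; test exercises all eight predicates and collects the results in one vector.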
define <8 x i64> @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; CHECK-LABEL: test_cmp_b_512
; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ##
  %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 0, i64 -1)
  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
; CHECK: vpcmpltb %zmm1, %zmm0, %k0 ##
  %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 1, i64 -1)
  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
; CHECK: vpcmpleb %zmm1, %zmm0, %k0 ##
  %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 2, i64 -1)
  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 ##
  %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 3, i64 -1)
  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 ##
  %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 4, i64 -1)
  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 ##
  %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 5, i64 -1)
  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 ##
  %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 6, i64 -1)
  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
; CHECK: vpcmpordb %zmm1, %zmm0, %k0 ##
  %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 7, i64 -1)
  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
  ret <8 x i64> %vec7
}

define <8 x i64> @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; CHECK-LABEL: test_mask_cmp_b_512
; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ##
  %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 0, i64 %mask)
  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
; CHECK: vpcmpltb %zmm1, %zmm0, %k0 {%k1} ##
  %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 1, i64 %mask)
  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
; CHECK: vpcmpleb %zmm1, %zmm0, %k0 {%k1} ##
  %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 2, i64 %mask)
  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 {%k1} ##
  %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 3, i64 %mask)
  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} ##
  %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 4, i64 %mask)
  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} ##
  %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 5, i64 %mask)
  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 {%k1} ##
  %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 6, i64 %mask)
  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
; CHECK: vpcmpordb %zmm1, %zmm0, %k0 {%k1} ##
  %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 7, i64 %mask)
  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
  ret <8 x i64> %vec7
}

declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i8, i64) nounwind readnone

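; A minimal sketch (a hypothetical helper, not checked above) of what
; predicate 1 computes in the signed byte compare:
define i64 @cmp_b_512_lt_sketch(<64 x i8> %a0, <64 x i8> %a1) {
  %cmp = icmp slt <64 x i8> %a0, %a1   ; signed less-than per byte
  %res = bitcast <64 x i1> %cmp to i64
  ret i64 %res
}
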
define <8 x i64> @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; CHECK-LABEL: test_ucmp_b_512
; CHECK: vpcmpequb %zmm1, %zmm0, %k0 ##
  %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 0, i64 -1)
  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
; CHECK: vpcmpltub %zmm1, %zmm0, %k0 ##
  %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 1, i64 -1)
  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
; CHECK: vpcmpleub %zmm1, %zmm0, %k0 ##
  %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 2, i64 -1)
  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 ##
  %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 3, i64 -1)
  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 ##
  %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 4, i64 -1)
  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 ##
  %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 5, i64 -1)
  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 ##
  %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 6, i64 -1)
  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
; CHECK: vpcmpordub %zmm1, %zmm0, %k0 ##
  %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 7, i64 -1)
  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
  ret <8 x i64> %vec7
}

define <8 x i64> @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; CHECK-LABEL: test_mask_x86_avx512_ucmp_b_512
; CHECK: vpcmpequb %zmm1, %zmm0, %k0 {%k1} ##
  %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 0, i64 %mask)
  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
; CHECK: vpcmpltub %zmm1, %zmm0, %k0 {%k1} ##
  %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 1, i64 %mask)
  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
; CHECK: vpcmpleub %zmm1, %zmm0, %k0 {%k1} ##
  %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 2, i64 %mask)
  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 {%k1} ##
  %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 3, i64 %mask)
  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 {%k1} ##
  %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 4, i64 %mask)
  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} ##
  %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 5, i64 %mask)
  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} ##
  %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 6, i64 %mask)
  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
; CHECK: vpcmpordub %zmm1, %zmm0, %k0 {%k1} ##
  %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i8 7, i64 %mask)
  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
  ret <8 x i64> %vec7
}

declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i8, i64) nounwind readnone

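; The unsigned counterpart of the earlier sketch uses icmp ult (again a
; hypothetical helper, not part of the checked tests):
define i64 @ucmp_b_512_ltu_sketch(<64 x i8> %a0, <64 x i8> %a1) {
  %cmp = icmp ult <64 x i8> %a0, %a1   ; unsigned less-than per byte
  %res = bitcast <64 x i1> %cmp to i64
  ret i64 %res
}
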
define <8 x i32> @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
; CHECK-LABEL: test_cmp_w_512
; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ##
  %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 0, i32 -1)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltw %zmm1, %zmm0, %k0 ##
  %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 1, i32 -1)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmplew %zmm1, %zmm0, %k0 ##
  %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 2, i32 -1)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 ##
  %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 3, i32 -1)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 ##
  %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 4, i32 -1)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 ##
  %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 5, i32 -1)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 ##
  %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 6, i32 -1)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordw %zmm1, %zmm0, %k0 ##
  %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 7, i32 -1)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

define <8 x i32> @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
; CHECK-LABEL: test_mask_cmp_w_512
; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ##
  %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 0, i32 %mask)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltw %zmm1, %zmm0, %k0 {%k1} ##
  %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 1, i32 %mask)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmplew %zmm1, %zmm0, %k0 {%k1} ##
  %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 2, i32 %mask)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 {%k1} ##
  %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 3, i32 %mask)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} ##
  %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 4, i32 %mask)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} ##
  %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 5, i32 %mask)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 {%k1} ##
  %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 6, i32 %mask)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordw %zmm1, %zmm0, %k0 {%k1} ##
  %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 7, i32 %mask)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i8, i32) nounwind readnone

define <8 x i32> @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
; CHECK-LABEL: test_ucmp_w_512
; CHECK: vpcmpequw %zmm1, %zmm0, %k0 ##
  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 0, i32 -1)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 ##
  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 1, i32 -1)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 ##
  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 2, i32 -1)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 ##
  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 3, i32 -1)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 ##
  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 4, i32 -1)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 ##
  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 5, i32 -1)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 ##
  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 6, i32 -1)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmporduw %zmm1, %zmm0, %k0 ##
  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 7, i32 -1)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

define <8 x i32> @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
; CHECK-LABEL: test_mask_ucmp_w_512
; CHECK: vpcmpequw %zmm1, %zmm0, %k0 {%k1} ##
  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 0, i32 %mask)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} ##
  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 1, i32 %mask)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} ##
  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 2, i32 %mask)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 {%k1} ##
  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 3, i32 %mask)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 {%k1} ##
  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 4, i32 %mask)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} ##
  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 5, i32 %mask)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} ##
  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 6, i32 %mask)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmporduw %zmm1, %zmm0, %k0 {%k1} ##
  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i8 7, i32 %mask)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i8, i32) nounwind readnone

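; The blend intrinsics below select between two vectors under a bitmask and
; should lower to vpblendm*. A minimal generic-IR sketch of the byte/256-bit
; case (a hypothetical helper; the operand order, taking the second source
; where the mask bit is set, is an assumption here):
define <32 x i8> @blend_b_256_sketch(i32 %mask, <32 x i8> %a1, <32 x i8> %a2) {
  %m = bitcast i32 %mask to <32 x i1>                        ; one bit per lane
  %res = select <32 x i1> %m, <32 x i8> %a2, <32 x i8> %a1   ; per-lane blend
  ret <32 x i8> %res
}
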
; CHECK-LABEL: test_x86_mask_blend_b_256
; CHECK: vpblendmb
define <32 x i8> @test_x86_mask_blend_b_256(i32 %a0, <32 x i8> %a1, <32 x i8> %a2) {
  %res = call <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8> %a1, <32 x i8> %a2, i32 %a0)
  ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8>, <32 x i8>, i32) nounwind readonly

; CHECK-LABEL: test_x86_mask_blend_w_256
; CHECK: vpblendmw
define <16 x i16> @test_x86_mask_blend_w_256(i16 %mask, <16 x i16> %a1, <16 x i16> %a2) {
  %res = call <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16> %a1, <16 x i16> %a2, i16 %mask)
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16>, <16 x i16>, i16) nounwind readonly

; CHECK-LABEL: test_x86_mask_blend_b_512
; CHECK: vpblendmb
define <64 x i8> @test_x86_mask_blend_b_512(i64 %a0, <64 x i8> %a1, <64 x i8> %a2) {
  %res = call <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8> %a1, <64 x i8> %a2, i64 %a0)
  ret <64 x i8> %res
}
declare <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8>, <64 x i8>, i64) nounwind readonly

; CHECK-LABEL: test_x86_mask_blend_w_512
; CHECK: vpblendmw
define <32 x i16> @test_x86_mask_blend_w_512(i32 %mask, <32 x i16> %a1, <32 x i16> %a2) {
  %res = call <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16> %a1, <32 x i16> %a2, i32 %mask)
  ret <32 x i16> %res
}
declare <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16>, <32 x i16>, i32) nounwind readonly

; CHECK-LABEL: test_x86_mask_blend_b_128
; CHECK: vpblendmb
define <16 x i8> @test_x86_mask_blend_b_128(i16 %a0, <16 x i8> %a1, <16 x i8> %a2) {
  %res = call <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8> %a1, <16 x i8> %a2, i16 %a0)
  ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8>, <16 x i8>, i16) nounwind readonly

; CHECK-LABEL: test_x86_mask_blend_w_128
; CHECK: vpblendmw
define <8 x i16> @test_x86_mask_blend_w_128(i8 %mask, <8 x i16> %a1, <8 x i16> %a2) {
  %res = call <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16> %a1, <8 x i16> %a2, i8 %mask)
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16>, <8 x i16>, i8) nounwind readonly