; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s

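; Each broadcast test below calls its intrinsic three times -- with an
; all-ones mask, a merge mask, and a zero mask -- and sums the results, so
; one function checks the plain, {%k1}, and {%k1} {z} instruction forms.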
declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly

define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm2
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1}
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vaddps %zmm1, %zmm2, %zmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq

  %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly

define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm2
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1}
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vaddpd %zmm1, %zmm2, %zmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq

  %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}

declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)

define <16 x i32> @test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm2
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
  %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res2, %res3
  ret <16 x i32> %res4
}

declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)

define <8 x i64> @test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm2
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddq %zmm1, %zmm2, %zmm1
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask)
  %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res2, %res3
  ret <8 x i64> %res4
}

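; The movsldup/movshdup/movddup tests follow the same three-call pattern;
; the expected element orders appear in the shuffle comments FileCheck matches.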
declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)

define <16 x float> @test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)

define <16 x float> @test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)

define <8 x double> @test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}

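; perm.df/perm.di take an 8-bit immediate (3 here) that vpermpd/vpermq apply
; per 256-bit half, giving the [3,0,0,0,7,4,4,4] pattern in the checks.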
declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8)

define <8 x double> @test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i32 %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vaddpd %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8)

define <8 x i64> @test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermq {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}

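; Each masked store test performs one masked store through %ptr and one
; unmasked store (mask = -1) through %ptr2, covering the unaligned
; (vmovups/vmovupd/vmovdqu*) and aligned (vmovaps/vmovapd/vmovdqa*) forms.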
define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
; CHECK-LABEL: test_store1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovups %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovups %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
  call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16)

define void @test_store2(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
; CHECK-LABEL: test_store2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovupd %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovupd %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)

define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
; CHECK-LABEL: test_mask_store_aligned_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovaps %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
  call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16)

define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
; CHECK-LABEL: test_mask_store_aligned_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovapd %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovapd %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
  call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)

define void @test_int_x86_avx512_mask_storeu_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovdqu64 %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovdqu64 %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.storeu.q.512(i8*, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_storeu_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovdqu32 %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
  call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.storeu.d.512(i8*, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_store_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
  call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.store.q.512(i8*, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_store_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovdqa32 %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovdqa32 %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
  call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.store.d.512(i8*, <16 x i32>, i16)

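; In each masked load test the first load uses an all-ones mask, the second
; merges into the first result under %mask, and the third is zero-masked;
; the final add keeps all three loads live.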
define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_load_aligned_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovaps (%rdi), %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    vmovaps (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
  %res4 = fadd <16 x float> %res2, %res1
  ret <16 x float> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)

define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovups (%rdi), %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovups (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    vmovups (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
  %res4 = fadd <16 x float> %res2, %res1
  ret <16 x float> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)

define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovapd (%rdi), %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    vmovapd (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
  %res4 = fadd <8 x double> %res2, %res1
  ret <8 x double> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)

define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovupd (%rdi), %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    vmovupd (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
  %res4 = fadd <8 x double> %res2, %res1
  ret <8 x double> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)

declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16)

define <16 x i32> @test_mask_load_unaligned_d(i8* %ptr, i8* %ptr2, <16 x i32> %data, i16 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovdqu32 (%rdi), %zmm0
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovdqu32 (%rsi), %zmm0 {%k1}
; CHECK-NEXT:    vmovdqu32 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr2, <16 x i32> %res, i16 %mask)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
  %res4 = add <16 x i32> %res2, %res1
  ret <16 x i32> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8)

define <8 x i64> @test_mask_load_unaligned_q(i8* %ptr, i8* %ptr2, <8 x i64> %data, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovdqu64 (%rdi), %zmm0
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovdqu64 (%rsi), %zmm0 {%k1}
; CHECK-NEXT:    vmovdqu64 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr2, <8 x i64> %res, i8 %mask)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
  %res4 = add <8 x i64> %res2, %res1
  ret <8 x i64> %res4
}

declare <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8*, <16 x i32>, i16)

define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_load_aligned_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> %res, i16 %mask)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
  %res4 = add <16 x i32> %res2, %res1
  ret <16 x i32> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8*, <8 x i64>, i8)

define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> %res, i8 %mask)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
  %res4 = add <8 x i64> %res2, %res1
  ret <8 x i64> %res4
}

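; vpermilpd/vpermilps use immediate 22 and vpshufd uses immediate 3; the
; resulting in-lane element orders are spelled out in the shuffle comments.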
declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)

define <8 x double> @test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm2 = zmm0[0,1,3,2,5,4,6,6]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6]
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6]
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vaddpd %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)

define <16 x float> @test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermilps {{.*#+}} zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilps {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res3, %res2
  ret <16 x float> %res4
}

declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>, i16)

define <16 x i32> @test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}

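; The compare tests return the result mask in a GPR via kmovw; the ## kill
; comments mark the implicit truncation of %eax to the AX/AL return value.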
define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)

define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)

define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)

define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)

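; Unpack tests: each intrinsic is checked merge-masked and unmasked
; (punpcklqd additionally checks the zero-masked form).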
declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)

define <8 x double> @test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}

declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)

define <16 x float> @test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vunpckhps {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}

declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)

define <8 x double> @test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}

declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)

define <16 x float> @test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}

declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64> @test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm1
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res2, %res3
  ret <8 x i64> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64> @test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpunpckhdq {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpunpckhdq {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpunpckldq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpunpckldq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

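; Shift-by-immediate tests: pslli/psrli/psrai on dwords and qwords, each in
; separate unmasked, merge-masked, and zero-masked functions, shifting by 7.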
define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone

define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone

define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone

define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone

define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone

define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone

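; Non-temporal store tests: the storent intrinsics lower to vmovntdq,
; vmovntpd, and vmovntps.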
declare void @llvm.x86.avx512.storent.q.512(i8*, <8 x i64>)

define void @test_storent_q_512(<8 x i64> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovntdq %zmm0, (%rdi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data)
  ret void
}

declare void @llvm.x86.avx512.storent.pd.512(i8*, <8 x double>)

define void @test_storent_pd_512(<8 x double> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovntpd %zmm0, (%rdi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data)
  ret void
}

declare void @llvm.x86.avx512.storent.ps.512(i8*, <16 x float>)

define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovntps %zmm0, (%rdi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data)
  ret void
}

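; Bitwise logic tests: pxor/por/pand lower to the dword (vpxord/vpord/vpandd)
; and qword (vpxorq/vporq/vpandq) forms; masked variants merge into %passThru.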
define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_xor_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpxord %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_xor_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpxord %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_or_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_mask_or_epi32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_or_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_and_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpandd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_mask_and_epi32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_and_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpandd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_xor_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_or_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_mask_or_epi64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_and_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_mask_and_epi64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
   1090