; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s

declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly

define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) {
; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vbroadcastss %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq

  %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly

define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) {
; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq

  %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}

declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
  %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res2, %res3
  ret <16 x i32> %res4
}

declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)

define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask)
  %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res2, %res3
  ret <8 x i64> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i32 %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8)

define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermq {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}

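; The store tests below exercise the masked store intrinsics: the first call in each
; function stores with the supplied mask, while the second uses an all-ones mask and
; is expected to lower to a plain unmasked store.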
define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
; CHECK-LABEL: test_store1:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovups %zmm0, (%rsi)
; CHECK-NEXT: retq
  call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
  call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )

define void @test_store2(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
; CHECK-LABEL: test_store2:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovupd %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovupd %zmm0, (%rsi)
; CHECK-NEXT: retq
  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)

define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
; CHECK-LABEL: test_mask_store_aligned_ps:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovaps %zmm0, (%rsi)
; CHECK-NEXT: retq
  call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
  call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )

define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
; CHECK-LABEL: test_mask_store_aligned_pd:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovapd %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovapd %zmm0, (%rsi)
; CHECK-NEXT: retq
  call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
  call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)

define void@test_int_x86_avx512_mask_storeu_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqu64 %zmm0, (%rsi)
; CHECK-NEXT: retq
  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.storeu.q.512(i8*, <8 x i64>, i8)

define void@test_int_x86_avx512_mask_storeu_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqu32 %zmm0, (%rsi)
; CHECK-NEXT: retq
  call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
  call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.storeu.d.512(i8*, <16 x i32>, i16)

define void@test_int_x86_avx512_mask_store_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi)
; CHECK-NEXT: retq
  call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
  call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.store.q.512(i8*, <8 x i64>, i8)

define void@test_int_x86_avx512_mask_store_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqa32 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqa32 %zmm0, (%rsi)
; CHECK-NEXT: retq
  call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
  call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.store.d.512(i8*, <16 x i32>, i16)

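; The load tests below combine an unmasked load, a merge-masked load into the
; previous result, and a zero-masked load, so that all three forms are checked.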
define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_load_aligned_ps:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovaps (%rdi), %zmm0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1}
; CHECK-NEXT: vmovaps (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
  %res4 = fadd <16 x float> %res2, %res1
  ret <16 x float> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)

define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_ps:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovups (%rdi), %zmm0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1}
; CHECK-NEXT: vmovups (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
  %res4 = fadd <16 x float> %res2, %res1
  ret <16 x float> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)

define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_pd:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovapd (%rdi), %zmm0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1}
; CHECK-NEXT: vmovapd (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
  %res4 = fadd <8 x double> %res2, %res1
  ret <8 x double> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)

define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_pd:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovupd (%rdi), %zmm0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; CHECK-NEXT: vmovupd (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
  %res4 = fadd <8 x double> %res2, %res1
  ret <8 x double> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)

declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16)

define <16 x i32> @test_mask_load_unaligned_d(i8* %ptr, i8* %ptr2, <16 x i32> %data, i16 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_d:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu32 (%rsi), %zmm0 {%k1}
; CHECK-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr2, <16 x i32> %res, i16 %mask)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
  %res4 = add <16 x i32> %res2, %res1
  ret <16 x i32> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8)

define <8 x i64> @test_mask_load_unaligned_q(i8* %ptr, i8* %ptr2, <8 x i64> %data, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_q:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu64 (%rsi), %zmm0 {%k1}
; CHECK-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr2, <8 x i64> %res, i8 %mask)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
  %res4 = add <8 x i64> %res2, %res1
  ret <8 x i64> %res4
}

declare <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8*, <16 x i32>, i16)

define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_load_aligned_d:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1}
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> %res, i16 %mask)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
  %res4 = add <16 x i32> %res2, %res1
  ret <16 x i32> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8*, <8 x i64>, i8)

define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_q:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1}
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> %res, i8 %mask)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
  %res4 = add <8 x i64> %res2, %res1
  ret <8 x i64> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermilpd {{.*#+}} zmm2 = zmm0[0,1,3,2,5,4,6,6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6]
; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6]
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res3, %res2
  ret <16 x float> %res4
}

declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}

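; The masked integer compares below produce their result in a mask register,
; which is then copied to a general-purpose register with kmovw.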
define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)

define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)

define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)

define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)

declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}

declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vunpckhps {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}

declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vunpcklpd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}

declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vunpcklps {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}

declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res2, %res3
  ret <8 x i64> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpunpckldq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpckldq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

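; The shift-by-immediate intrinsics below are each checked in unmasked,
; merge-masked and zero-masked form.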
define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_d:
; CHECK: ## BB#0:
; CHECK-NEXT: vpslld $7, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_d:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_d:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone

define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_q:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone

define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_d:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_d:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_d:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone

define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_q:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone

define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_d:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_d:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_d:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone

define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_q:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone

declare void @llvm.x86.avx512.storent.q.512(i8*, <8 x i64>)

define void@test_storent_q_512(<8 x i64> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovntdq %zmm0, (%rdi)
; CHECK-NEXT: retq
  call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data)
  ret void
}

declare void @llvm.x86.avx512.storent.pd.512(i8*, <8 x double>)

define void @test_storent_pd_512(<8 x double> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovntpd %zmm0, (%rdi)
; CHECK-NEXT: retq
  call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data)
  ret void
}

declare void @llvm.x86.avx512.storent.ps.512(i8*, <16 x float>)

define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovntps %zmm0, (%rdi)
; CHECK-NEXT: retq
  call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data)
  ret void
}

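; The remaining tests cover the masked bitwise logic intrinsics (xor/or/and on
; packed i32 and i64), in unmasked and merge-masked variants.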
define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_xor_epi32:
; CHECK: ## BB#0:
; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_xor_epi32:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_or_epi32:
; CHECK: ## BB#0:
; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_or_epi32:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_and_epi32:
; CHECK: ## BB#0:
; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_and_epi32:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_xor_epi64:
; CHECK: ## BB#0:
; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi64:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_or_epi64:
; CHECK: ## BB#0:
; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi64:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_and_epi64:
; CHECK: ## BB#0:
; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi64:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)