; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX

; To test for the case where masked load/store is not legal, we should add a run with a target
; that does not have AVX, but that case should probably be a separate test file using fewer tests
; because it takes over 1.2 seconds to codegen these tests on Haswell 4GHz if there's no maskmov.

define <1 x double> @loadv1(<1 x i64> %trigger, <1 x double>* %addr, <1 x double> %dst) {
; AVX-LABEL: loadv1:
; AVX: ## %bb.0:
; AVX-NEXT: testq %rdi, %rdi
; AVX-NEXT: ## implicit-def: $xmm1
; AVX-NEXT: je LBB0_1
; AVX-NEXT: ## %bb.2: ## %else
; AVX-NEXT: testq %rdi, %rdi
; AVX-NEXT: jne LBB0_3
; AVX-NEXT: LBB0_4: ## %else
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
; AVX-NEXT: LBB0_1: ## %cond.load
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: testq %rdi, %rdi
; AVX-NEXT: je LBB0_4
; AVX-NEXT: LBB0_3: ## %else
; AVX-NEXT: vmovaps %xmm0, %xmm1
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: loadv1:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: testq %rdi, %rdi
; AVX512F-NEXT: ## implicit-def: $xmm1
; AVX512F-NEXT: jne LBB0_2
; AVX512F-NEXT: ## %bb.1: ## %cond.load
; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: LBB0_2: ## %else
; AVX512F-NEXT: testq %rdi, %rdi
; AVX512F-NEXT: sete %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512F-NEXT: retq
;
; SKX-LABEL: loadv1:
; SKX: ## %bb.0:
; SKX-NEXT: testq %rdi, %rdi
; SKX-NEXT: ## implicit-def: $xmm1
; SKX-NEXT: jne LBB0_2
; SKX-NEXT: ## %bb.1: ## %cond.load
; SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; SKX-NEXT: LBB0_2: ## %else
; SKX-NEXT: testq %rdi, %rdi
; SKX-NEXT: sete %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; SKX-NEXT: retq
  %mask = icmp eq <1 x i64> %trigger, zeroinitializer
  %res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1>%mask, <1 x double>%dst)
  ret <1 x double> %res
}
declare <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>*, i32, <1 x i1>, <1 x double>)

define void @storev1(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> %val) {
; AVX-LABEL: storev1:
; AVX: ## %bb.0:
; AVX-NEXT: testl %edi, %edi
; AVX-NEXT: je LBB1_1
; AVX-NEXT: ## %bb.2: ## %else
; AVX-NEXT: retq
; AVX-NEXT: LBB1_1: ## %cond.store
; AVX-NEXT: movl %edx, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: storev1:
; AVX512: ## %bb.0:
; AVX512-NEXT: testl %edi, %edi
; AVX512-NEXT: je LBB1_1
; AVX512-NEXT: ## %bb.2: ## %else
; AVX512-NEXT: retq
; AVX512-NEXT: LBB1_1: ## %cond.store
; AVX512-NEXT: movl %edx, (%rsi)
; AVX512-NEXT: retq
  %mask = icmp eq <1 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>%val, <1 x i32>* %addr, i32 4, <1 x i1>%mask)
  ret void
}
declare void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>, <1 x i32>*, i32, <1 x i1>)
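; Illustrative sketch (not one of the checked tests): the per-lane semantics the
; masked intrinsics in this file assume. For one lane of a masked load, the
; scalarized form is roughly the branchy pattern below; %elt.addr and %passthru
; are placeholder names used only for this sketch:
;
;   %bit = extractelement <1 x i1> %mask, i32 0
;   br i1 %bit, label %cond.load, label %else
; cond.load:
;   %elt = load double, double* %elt.addr, align 4
;   br label %else
; else:
;   %lane = phi double [ %elt, %cond.load ], [ %passthru, %entry ]
;
; The AVX/AVX512 runs below check that wider versions of this pattern are folded
; into vmaskmov/vpmaskmov or {%k}-predicated moves instead of branches.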

define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
; AVX-LABEL: test6:
; AVX: ## %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: test6:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test6:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
  ret <2 x double> %res
}

define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
; AVX-LABEL: test7:
; AVX: ## %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: test7:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test7:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
  ret <4 x float> %res
}

define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
; AVX1-LABEL: test8:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test8:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test8:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
  ret <4 x i32> %res
}

define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
; AVX1-LABEL: test9:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test9:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test9:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test9:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
  ret void
}

define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
; AVX1-LABEL: test10:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test10:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test10:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
  ret <4 x double> %res
}

define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
; AVX1-LABEL: test10b:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10b:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test10b:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test10b:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer)
  ret <4 x double> %res
}

define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
; AVX1-LABEL: test11a:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11a:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test11a:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11a:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1
; SKX-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
  ret <8 x float> %res
}

define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
; AVX1-LABEL: test11b:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11b:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2
; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test11b:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11b:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
  %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
  ret <8 x i32> %res
}

define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
; AVX1-LABEL: test11c:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11c:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test11c:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11c:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
  ret <8 x float> %res
}

define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
; AVX1-LABEL: test11d:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11d:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test11d:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11d:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
  %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
  ret <8 x i32> %res
}

define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
; AVX1-LABEL: test12:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test12:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test12:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1
; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
  ret void
}

define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
; AVX1-LABEL: test14:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test14:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test14:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test14:
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
  ret void
}

define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
; AVX1-LABEL: test15:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test15:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test15:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test15:
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
  ret void
}

define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
; AVX1-LABEL: test16:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test16:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test16:
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
  ret <2 x float> %res
}

define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
; AVX1-LABEL: test17:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test17:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test17:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test17:
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
; SKX-NEXT: retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
  ret <2 x i32> %res
}

define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
; AVX1-LABEL: test18:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test18:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test18:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test18:
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
  ret <2 x float> %res
}

define <4 x float> @load_all(<4 x i32> %trigger, <4 x float>* %addr) {
; AVX-LABEL: load_all:
; AVX: ## %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: load_all:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: movw $15, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: load_all:
; SKX: ## %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
  ret <4 x float> %res
}

;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend.

; 128-bit FP vectors are supported with AVX.

define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
; AVX-LABEL: mload_constmask_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4f32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: movw $13, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4f32:
; SKX: ## %bb.0:
; SKX-NEXT: movb $13, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1}
; SKX-NEXT: retq
  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
  ret <4 x float> %res
}

define <2 x double> @mload_constmask_v2f64(<2 x double>* %addr, <2 x double> %dst) {
; AVX-LABEL: mload_constmask_v2f64:
; AVX: ## %bb.0:
; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: retq
;
; AVX512-LABEL: mload_constmask_v2f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX512-NEXT: retq
  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x double> %dst)
  ret <2 x double> %res
}

; 128-bit integer vectors are supported with AVX2.

define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
; AVX1-LABEL: mload_constmask_v4i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: mload_constmask_v4i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: movw $14, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4i32:
; SKX: ## %bb.0:
; SKX-NEXT: movb $14, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
; SKX-NEXT: retq
  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
  ret <4 x i32> %res
}

define <2 x i64> @mload_constmask_v2i64(<2 x i64>* %addr, <2 x i64> %dst) {
; AVX-LABEL: mload_constmask_v2i64:
; AVX: ## %bb.0:
; AVX-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: mload_constmask_v2i64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm0
; AVX512-NEXT: retq
  %res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst)
  ret <2 x i64> %res
}

; 256-bit FP vectors are supported with AVX.

define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) {
; AVX-LABEL: mload_constmask_v8f32:
; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
; AVX-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8f32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movw $7, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v8f32:
; SKX: ## %bb.0:
; SKX-NEXT: movb $7, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovups (%rdi), %ymm0 {%k1}
; SKX-NEXT: retq
  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
  ret <8 x float> %res
}

define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %dst) {
; AVX-LABEL: mload_constmask_v4f64:
; AVX: ## %bb.0:
; AVX-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm1
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4f64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movb $7, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4f64:
; SKX: ## %bb.0:
; SKX-NEXT: movb $7, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1}
; SKX-NEXT: retq
  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
  ret <4 x double> %res
}

; 256-bit integer vectors are supported with AVX2.

define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
; AVX-LABEL: mload_constmask_v8i32:
; AVX: ## %bb.0:
; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movw $135, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v8i32:
; SKX: ## %bb.0:
; SKX-NEXT: movb $-121, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
; SKX-NEXT: retq
  %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
  ret <8 x i32> %res
}

define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
; AVX-LABEL: mload_constmask_v4i64:
; AVX: ## %bb.0:
; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4i64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movb $9, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4i64:
; SKX: ## %bb.0:
; SKX-NEXT: movb $9, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
; SKX-NEXT: retq
  %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
  ret <4 x i64> %res
}

; 512-bit FP vectors are supported with AVX512.

define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {
; AVX-LABEL: mload_constmask_v8f64:
; AVX: ## %bb.0:
; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8f64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: movb $-121, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v8f64:
; SKX: ## %bb.0:
; SKX-NEXT: movb $-121, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; SKX-NEXT: retq
  %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
  ret <8 x double> %res
}

; If the pass-through operand is undef, no blend is needed.

define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr) {
; AVX-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX: ## %bb.0:
; AVX-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: movb $7, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4f64_undef_passthrough:
; SKX: ## %bb.0:
; SKX-NEXT: movb $7, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
  ret <4 x double> %res
}

define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX1: ## %bb.0:
; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX2: ## %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: movb $6, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4i64_undef_passthrough:
; SKX: ## %bb.0:
; SKX-NEXT: movb $6, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
  %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
  ret <4 x i64> %res
}

define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
; AVX1-LABEL: test21:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test21:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test21:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: movw $15, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test21:
; SKX: ## %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; When only one element of the mask is set, reduce to a scalar store.

define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
; AVX-LABEL: one_mask_bit_set1:
; AVX: ## %bb.0:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set1:
; AVX512: ## %bb.0:
; AVX512-NEXT: vmovss %xmm0, (%rdi)
; AVX512-NEXT: retq
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
  ret void
}

; Choose a different element to show that the correct address offset is produced.

define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
; AVX-LABEL: one_mask_bit_set2:
; AVX: ## %bb.0:
; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set2:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi)
; AVX512-NEXT: retq
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
  ret void
}

; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.

define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; AVX-LABEL: one_mask_bit_set3:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set3:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovlps %xmm0, 16(%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
  ret void
}

; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.

define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; AVX-LABEL: one_mask_bit_set4:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovhpd %xmm0, 24(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set4:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
  ret void
}

; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.

define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; AVX-LABEL: one_mask_bit_set5:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vmovlps %xmm0, 48(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set5:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmovlps %xmm0, 48(%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
  ret void
}

; When only one element of the mask is set, reduce to a scalar load.

define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
; AVX-LABEL: load_one_mask_bit_set1:
; AVX: ## %bb.0:
; AVX-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set1:
; AVX512: ## %bb.0:
; AVX512-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
; AVX512-NEXT: retq
  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
  ret <4 x i32> %res
}

; Choose a different element to show that the correct address offset is produced.

define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
; AVX-LABEL: load_one_mask_bit_set2:
; AVX: ## %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set2:
; AVX512: ## %bb.0:
; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX512-NEXT: retq
  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
  ret <4 x float> %res
}

; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.

define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; AVX1-LABEL: load_one_mask_bit_set3:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_one_mask_bit_set3:
; AVX2: ## %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set3:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
  ret <4 x i64> %res
}

; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.

define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; AVX-LABEL: load_one_mask_bit_set4:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set4:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
  ret <4 x double> %res
}

; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.

define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; AVX-LABEL: load_one_mask_bit_set5:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set5:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
  ret <8 x double> %res
}

; The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed.
; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that.

define void @trunc_mask(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) {
; AVX-LABEL: trunc_mask:
; AVX: ## %bb.0:
; AVX-NEXT: vmaskmovps %xmm0, %xmm2, (%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_mask:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm1, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: trunc_mask:
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpcmpgtd %xmm2, %xmm1, %k1
; SKX-NEXT: vmovups %xmm0, (%rdi) {%k1}
; SKX-NEXT: retq
  %bool_mask = icmp slt <4 x i32> %mask, zeroinitializer
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %x, <4 x float>* %ptr, i32 1, <4 x i1> %bool_mask)
  ret void
}

; This needs to be widened to v4i32.
; This used to assert in type legalization. PR38436
; FIXME: The codegen for AVX512 should use KSHIFT to zero the upper bits of the mask.
define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
; AVX1-LABEL: widen_masked_store:
; AVX1: ## %bb.0:
; AVX1-NEXT: vmovd %edx, %xmm1
; AVX1-NEXT: vmovd %esi, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: widen_masked_store:
; AVX2: ## %bb.0:
; AVX2-NEXT: vmovd %edx, %xmm1
; AVX2-NEXT: vmovd %esi, %xmm2
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-NEXT: vmovd %ecx, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: widen_masked_store:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: widen_masked_store:
; SKX: ## %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; SKX-NEXT: vmovdqa32 %xmm1, %xmm1 {%k1} {z}
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1}
; SKX-NEXT: retq
  call void @llvm.masked.store.v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v3i32(<3 x i32>, <3 x i32>*, i32, <3 x i1>)

declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)
declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)