; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX

define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
; AVX512-LABEL: test1:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1> %mask, <16 x i32> undef)
  ret <16 x i32> %res
}

define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
; AVX512-LABEL: test2:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1> %mask, <16 x i32> zeroinitializer)
  ret <16 x i32> %res
}

define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
; AVX512-LABEL: test3:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %val, <16 x i32>* %addr, i32 4, <16 x i1> %mask)
  ret void
}

define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
; AVX512-LABEL: test4:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512-NEXT:    retq
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %addr, i32 4, <16 x i1> %mask, <16 x float> %dst)
  ret <16 x float> %res
}

define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) {
; AVX512-LABEL: test13:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT:    vmovups %zmm1, (%rdi) {%k1}
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %val, <16 x float>* %addr, i32 4, <16 x i1> %mask)
  ret void
}

define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; AVX512-LABEL: one_mask_bit_set5:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmovlps %xmm0, 48(%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
  ret void
}

define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
;
; AVX512-LABEL: load_one_mask_bit_set5:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
; AVX512-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512-NEXT:    vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
  ret <8 x double> %res
}
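
; With only one mask bit set, the masked ops above are scalarized rather than
; emitted as masked vector memory operations: element 6 of %val lives in the
; top 128-bit lane, so it is pulled out with vextractf32x4 $3 and stored with
; a plain vmovlps at offset 48 (= 6 * 8 bytes). A rough IR equivalent of the
; store (the %ptr6/%elt6 names are illustrative, not part of this test):
;
;   %ptr6 = getelementptr inbounds <8 x double>, <8 x double>* %addr, i64 0, i64 6
;   %elt6 = extractelement <8 x double> %val, i64 6
;   store double %elt6, double* %ptr6, align 4
;
; load_one_mask_bit_set5 does the converse for element 7: vmovhpd folds the
; scalar load and vinsertf32x4 merges it back into the %val passthru.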

declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)

declare <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>)

define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) {
; AVX512-LABEL: test23:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; AVX512-NEXT:    vptestnmq %zmm0, %zmm0, %k2
; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k2} {z}
; AVX512-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = icmp eq <16 x i32*> %trigger, zeroinitializer
  %res = call <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1> %mask, <16 x i32*> zeroinitializer)
  ret <16 x i32*> %res
}

%mystruct = type { i16, i16, [1 x i8*] }

declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>)

define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
; AVX512F-LABEL: test24:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
; AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test24:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
; SKX-NEXT:    retq
  %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1> %mask, <16 x %mystruct*> zeroinitializer)
  ret <16 x %mystruct*> %res
}

define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; AVX512F-LABEL: test_store_16i64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
; AVX512F-NEXT:    vmovdqu64 %zmm2, 64(%rdi) {%k1}
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_store_16i64:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovdqu64 %zmm1, (%rdi) {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovdqu64 %zmm2, 64(%rdi) {%k1}
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  call void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)
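
; A <16 x i64> value needs two zmm registers, so the store above is legalized
; into two v8i64 masked stores: %zmm1 to (%rdi) under the low eight mask bits
; and %zmm2 to 64(%rdi) under the high eight, which kshiftrw $8 shifts into
; place. A sketch of the split in IR terms (names like %lo and %ptrs.lo are
; assumed for illustration; the actual split happens on SelectionDAG nodes):
;
;   %lo = shufflevector <16 x i64> %src0, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;   %mlo = shufflevector <16 x i1> %mask, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;   call void @llvm.masked.store.v8i64.p0v8i64(<8 x i64> %lo, <8 x i64>* %ptrs.lo, i32 4, <8 x i1> %mlo)
;   ; ...and likewise for elements 8-15 at byte offset 64.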

define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; AVX512F-LABEL: test_store_16f64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
; AVX512F-NEXT:    vmovupd %zmm2, 64(%rdi) {%k1}
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_store_16f64:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovupd %zmm2, 64(%rdi) {%k1}
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)

define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; AVX512F-LABEL: test_load_16i64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
; AVX512F-NEXT:    vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1}
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_load_16i64:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1}
; SKX-NEXT:    retq
  %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
  ret <16 x i64> %res
}
declare <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)

define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; AVX512F-LABEL: test_load_16f64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
; AVX512F-NEXT:    vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_load_16f64:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
; SKX-NEXT:    retq
  %res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
  ret <16 x double> %res
}
declare <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
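
; In the v16 load/store tests above, the two run lines differ only in mask
; materialization. Plain AVX512F has no byte-granularity k-instructions, so
; the <16 x i1> argument arriving in %xmm0 is widened to dwords (vpmovsxbd),
; the mask bit is moved to the sign position (vpslld $31), and vptestmd
; produces the k-register. With avx512bw, SKX forms the mask directly from
; the byte sign bits via vpsllw $7 + vpmovb2m. Loads with a passthru operand
; become masked blends (vpblendmq/vblendmpd) instead of zero-masked moves.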

define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
; AVX512F-LABEL: test_load_32f64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm5
; AVX512F-NEXT:    vpmovsxbd %xmm5, %zmm5
; AVX512F-NEXT:    vpslld $31, %zmm5, %zmm5
; AVX512F-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k2}
; AVX512F-NEXT:    vblendmpd 128(%rdi), %zmm3, %zmm5 {%k1}
; AVX512F-NEXT:    kshiftrw $8, %k2, %k2
; AVX512F-NEXT:    vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
; AVX512F-NEXT:    vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
; AVX512F-NEXT:    vmovapd %zmm5, %zmm2
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_load_32f64:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $7, %ymm0, %ymm0
; SKX-NEXT:    vpmovb2m %ymm0, %k1
; SKX-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
; SKX-NEXT:    kshiftrd $16, %k1, %k1
; SKX-NEXT:    vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
; SKX-NEXT:    retq
  %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
  ret <32 x double> %res
}

declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
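
; For <32 x double>, four zmm halves are needed (offsets 0, 64, 128, 192, one
; per group of 8 doubles). In test_load_32f64 above, SKX keeps all 32 mask
; bits in one k-register (vpmovb2m %ymm0) and carves out each quarter with
; kshiftrd $16 and kshiftrw $8, while AVX512F must rebuild a separate 16-bit
; mask from each xmm half of the <32 x i1> vector (vextracti128 plus the
; usual vpmovsxbd/vpslld/vptestmd sequence).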