1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64 3 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32 4 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX 5 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32 6 ; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR 7 8 9 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" 10 target triple = "x86_64-unknown-linux-gnu" 11 12 13 ; SCALAR-LABEL: test1 14 ; SCALAR: extractelement <16 x float*> 15 ; SCALAR-NEXT: load float 16 ; SCALAR-NEXT: insertelement <16 x float> 17 ; SCALAR-NEXT: extractelement <16 x float*> 18 ; SCALAR-NEXT: load float 19 20 define <16 x float> @test1(float* %base, <16 x i32> %ind) { 21 ; KNL_64-LABEL: test1: 22 ; KNL_64: # BB#0: 23 ; KNL_64-NEXT: kxnorw %k1, %k1, %k1 24 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 25 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0 26 ; KNL_64-NEXT: retq 27 ; 28 ; KNL_32-LABEL: test1: 29 ; KNL_32: # BB#0: 30 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 31 ; KNL_32-NEXT: kxnorw %k1, %k1, %k1 32 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 33 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 34 ; KNL_32-NEXT: retl 35 ; 36 ; SKX-LABEL: test1: 37 ; SKX: # BB#0: 38 ; SKX-NEXT: kxnorw %k1, %k1, %k1 39 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 40 ; SKX-NEXT: vmovaps %zmm1, %zmm0 41 ; SKX-NEXT: retq 42 43 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 44 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer 45 46 %sext_ind = sext <16 x i32> %ind to <16 x i64> 47 %gep.random = getelementptr 
float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind 48 49 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 50 ret <16 x float>%res 51 } 52 53 declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>) 54 declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>) 55 declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> ) 56 57 58 ; SCALAR-LABEL: test2 59 ; SCALAR: extractelement <16 x float*> 60 ; SCALAR-NEXT: load float 61 ; SCALAR-NEXT: insertelement <16 x float> 62 ; SCALAR-NEXT: br label %else 63 ; SCALAR: else: 64 ; SCALAR-NEXT: %res.phi.else = phi 65 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1 66 ; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true 67 ; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2 68 69 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) { 70 ; KNL_64-LABEL: test2: 71 ; KNL_64: # BB#0: 72 ; KNL_64-NEXT: kmovw %esi, %k1 73 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 74 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0 75 ; KNL_64-NEXT: retq 76 ; 77 ; KNL_32-LABEL: test2: 78 ; KNL_32: # BB#0: 79 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 80 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 81 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 82 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 83 ; KNL_32-NEXT: retl 84 ; 85 ; SKX-LABEL: test2: 86 ; SKX: # BB#0: 87 ; SKX-NEXT: kmovw %esi, %k1 88 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 89 ; SKX-NEXT: vmovaps %zmm1, %zmm0 90 ; SKX-NEXT: retq 91 92 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 93 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer 94 95 
%sext_ind = sext <16 x i32> %ind to <16 x i64> 96 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind 97 %imask = bitcast i16 %mask to <16 x i1> 98 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef) 99 ret <16 x float> %res 100 } 101 102 define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) { 103 ; KNL_64-LABEL: test3: 104 ; KNL_64: # BB#0: 105 ; KNL_64-NEXT: kmovw %esi, %k1 106 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} 107 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0 108 ; KNL_64-NEXT: retq 109 ; 110 ; KNL_32-LABEL: test3: 111 ; KNL_32: # BB#0: 112 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 113 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 114 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} 115 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 116 ; KNL_32-NEXT: retl 117 ; 118 ; SKX-LABEL: test3: 119 ; SKX: # BB#0: 120 ; SKX-NEXT: kmovw %esi, %k1 121 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} 122 ; SKX-NEXT: vmovaps %zmm1, %zmm0 123 ; SKX-NEXT: retq 124 125 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 126 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer 127 128 %sext_ind = sext <16 x i32> %ind to <16 x i64> 129 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind 130 %imask = bitcast i16 %mask to <16 x i1> 131 %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef) 132 ret <16 x i32> %res 133 } 134 135 136 define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) { 137 ; KNL_64-LABEL: test4: 138 ; KNL_64: # BB#0: 139 ; KNL_64-NEXT: kmovw %esi, %k1 140 ; KNL_64-NEXT: kmovw %k1, %k2 141 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} 142 ; KNL_64-NEXT: vmovaps %zmm1, %zmm2 143 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} 144 ; KNL_64-NEXT: 
vpaddd %zmm2, %zmm1, %zmm0 145 ; KNL_64-NEXT: retq 146 ; 147 ; KNL_32-LABEL: test4: 148 ; KNL_32: # BB#0: 149 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 150 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 151 ; KNL_32-NEXT: kmovw %k1, %k2 152 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} 153 ; KNL_32-NEXT: vmovaps %zmm1, %zmm2 154 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} 155 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 156 ; KNL_32-NEXT: retl 157 ; 158 ; SKX-LABEL: test4: 159 ; SKX: # BB#0: 160 ; SKX-NEXT: kmovw %esi, %k1 161 ; SKX-NEXT: kmovw %k1, %k2 162 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} 163 ; SKX-NEXT: vmovaps %zmm1, %zmm2 164 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} 165 ; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0 166 ; SKX-NEXT: retq 167 168 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 169 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer 170 171 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind 172 %imask = bitcast i16 %mask to <16 x i1> 173 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef) 174 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1) 175 %res = add <16 x i32> %gt1, %gt2 176 ret <16 x i32> %res 177 } 178 179 180 ; SCALAR-LABEL: test5 181 ; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0 182 ; SCALAR-NEXT: %ToStore0 = icmp eq i1 %Mask0, true 183 ; SCALAR-NEXT: br i1 %ToStore0, label %cond.store, label %else 184 ; SCALAR: cond.store: 185 ; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i32 0 186 ; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0 187 ; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4 188 ; SCALAR-NEXT: br label %else 189 ; SCALAR: else: 190 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1 191 ; 
SCALAR-NEXT: %ToStore1 = icmp eq i1 %Mask1, true 192 ; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2 193 194 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { 195 ; KNL_64-LABEL: test5: 196 ; KNL_64: # BB#0: 197 ; KNL_64-NEXT: kmovw %esi, %k1 198 ; KNL_64-NEXT: kmovw %k1, %k2 199 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2} 200 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} 201 ; KNL_64-NEXT: retq 202 ; 203 ; KNL_32-LABEL: test5: 204 ; KNL_32: # BB#0: 205 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 206 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 207 ; KNL_32-NEXT: kmovw %k1, %k2 208 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2} 209 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1} 210 ; KNL_32-NEXT: retl 211 ; 212 ; SKX-LABEL: test5: 213 ; SKX: # BB#0: 214 ; SKX-NEXT: kmovw %esi, %k1 215 ; SKX-NEXT: kmovw %k1, %k2 216 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2} 217 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} 218 ; SKX-NEXT: retq 219 220 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 221 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer 222 223 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind 224 %imask = bitcast i16 %mask to <16 x i1> 225 call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) 226 call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) 227 ret void 228 } 229 230 declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> ) 231 declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> ) 232 233 234 ; SCALAR-LABEL: test6 235 ; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4 236 ; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i32 1 237 ; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i32 1 
238 ; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4 239 ; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i32 2 240 ; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i32 2 241 ; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4 242 243 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) { 244 ; KNL_64-LABEL: test6: 245 ; KNL_64: # BB#0: 246 ; KNL_64-NEXT: kxnorw %k1, %k1, %k1 247 ; KNL_64-NEXT: kxnorw %k2, %k2, %k2 248 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 249 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 250 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0 251 ; KNL_64-NEXT: retq 252 ; 253 ; KNL_32-LABEL: test6: 254 ; KNL_32: # BB#0: 255 ; KNL_32-NEXT: kxnorw %k1, %k1, %k1 256 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2 257 ; KNL_32-NEXT: kxnorw %k2, %k2, %k2 258 ; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2} 259 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1} 260 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 261 ; KNL_32-NEXT: retl 262 ; 263 ; SKX-LABEL: test6: 264 ; SKX: # BB#0: 265 ; SKX-NEXT: kxnorw %k1, %k1, %k1 266 ; SKX-NEXT: kxnorw %k2, %k2, %k2 267 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 268 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 269 ; SKX-NEXT: vmovaps %zmm2, %zmm0 270 ; SKX-NEXT: retq 271 272 %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) 273 274 call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) 275 ret <8 x i32>%a 276 } 277 278 define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) { 279 ; 280 ; KNL_64-LABEL: test7: 281 ; KNL_64: # BB#0: 282 ; KNL_64-NEXT: movzbl %sil, %eax 283 ; KNL_64-NEXT: kmovw %eax, %k1 284 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 285 ; KNL_64-NEXT: kmovw %k1, %k2 286 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2} 287 ; KNL_64-NEXT: vmovaps %zmm1, %zmm2 288 ; KNL_64-NEXT: 
vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1} 289 ; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0 290 ; KNL_64-NEXT: retq 291 ; 292 ; KNL_32-LABEL: test7: 293 ; KNL_32: # BB#0: 294 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 295 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 296 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 297 ; KNL_32-NEXT: kmovw %k1, %k2 298 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2} 299 ; KNL_32-NEXT: vmovaps %zmm1, %zmm2 300 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1} 301 ; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0 302 ; KNL_32-NEXT: retl 303 ; 304 ; SKX-LABEL: test7: 305 ; SKX: # BB#0: 306 ; SKX-NEXT: kmovb %esi, %k1 307 ; SKX-NEXT: kmovw %k1, %k2 308 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2} 309 ; SKX-NEXT: vmovaps %zmm1, %zmm2 310 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1} 311 ; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0 312 ; SKX-NEXT: retq 313 314 %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0 315 %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer 316 317 %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind 318 %imask = bitcast i8 %mask to <8 x i1> 319 %gt1 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef) 320 %gt2 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1) 321 %res = add <8 x i32> %gt1, %gt2 322 ret <8 x i32> %res 323 } 324 325 ; No uniform base in this case, index <8 x i64> contains addresses, 326 ; each gather call will be split into two 327 define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) { 328 ; KNL_64-LABEL: test8: 329 ; KNL_64: # BB#0: 330 ; KNL_64-NEXT: kmovw %edi, %k1 331 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 332 ; KNL_64-NEXT: kmovw %k2, %k3 333 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3} 334 ; KNL_64-NEXT: kmovw %k1, %k3 335 ; KNL_64-NEXT: vpgatherqd (,%zmm0), 
%ymm3 {%k3} 336 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4 337 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 338 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} 339 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 340 ; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0 341 ; KNL_64-NEXT: retq 342 ; 343 ; KNL_32-LABEL: test8: 344 ; KNL_32: # BB#0: 345 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 346 ; KNL_32-NEXT: kmovw %k1, %k2 347 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2} 348 ; KNL_32-NEXT: vmovaps %zmm1, %zmm2 349 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} 350 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 351 ; KNL_32-NEXT: retl 352 ; 353 ; SKX-LABEL: test8: 354 ; SKX: # BB#0: 355 ; SKX-NEXT: kmovw %edi, %k1 356 ; SKX-NEXT: kshiftrw $8, %k1, %k2 357 ; SKX-NEXT: kmovw %k2, %k3 358 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3} 359 ; SKX-NEXT: kmovw %k1, %k3 360 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3} 361 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4 362 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 363 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} 364 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0 365 ; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0 366 ; SKX-NEXT: retq 367 ; 368 ; SKX_32-LABEL: test8: 369 ; SKX_32: # BB#0: 370 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 371 ; SKX_32-NEXT: kmovw %k1, %k2 372 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2} 373 ; SKX_32-NEXT: vmovaps %zmm1, %zmm2 374 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} 375 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 376 ; SKX_32-NEXT: retl 377 378 %imask = bitcast i16 %mask to <16 x i1> 379 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef) 380 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1) 381 %res = add <16 x i32> %gt1, %gt2 382 ret <16 x i32> %res 383 } 384 385 %struct.RT = type { i8, [10 x [20 x i32]], i8 } 386 %struct.ST = type { i32, double, 
%struct.RT } 387 388 ; Masked gather for aggregate types 389 ; Test9 and Test10 should give the same result (scalar and vector indices in GEP) 390 391 392 define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) { 393 ; KNL_64-LABEL: test9: 394 ; KNL_64: # BB#0: # %entry 395 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2 396 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 397 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3 398 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 399 ; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1 400 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 401 ; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1 402 ; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1 403 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3 404 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4 405 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0 406 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 407 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0 408 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0 409 ; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 410 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 411 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 412 ; KNL_64-NEXT: kxnorw %k1, %k1, %k1 413 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} 414 ; KNL_64-NEXT: retq 415 ; 416 ; KNL_32-LABEL: test9: 417 ; KNL_32: # BB#0: # %entry 418 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2 419 ; KNL_32-NEXT: vpbroadcastd .LCPI8_0, %ymm3 420 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1 421 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0 422 ; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3 423 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0 424 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0 425 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 426 ; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1 427 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 428 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1 429 ; KNL_32-NEXT: kxnorw %k1, %k1, %k1 430 ; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} 431 ; KNL_32-NEXT: retl 432 ; 433 ; SKX-LABEL: test9: 434 ; SKX: # BB#0: # %entry 435 ; SKX-NEXT: vpbroadcastq %rdi, %zmm2 
436 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0 437 ; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0 438 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1 439 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1 440 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 441 ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 442 ; SKX-NEXT: kxnorw %k1, %k1, %k1 443 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} 444 ; SKX-NEXT: retq 445 entry: 446 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0 447 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer 448 449 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13> 450 %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) 451 ret <8 x i32> %res 452 } 453 454 define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) { 455 ; KNL_64-LABEL: test10: 456 ; KNL_64: # BB#0: # %entry 457 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2 458 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 459 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3 460 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 461 ; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1 462 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 463 ; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1 464 ; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1 465 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3 466 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4 467 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0 468 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 469 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0 470 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0 471 ; KNL_64-NEXT: vpaddq %zmm0, 
%zmm2, %zmm0 472 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 473 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 474 ; KNL_64-NEXT: kxnorw %k1, %k1, %k1 475 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} 476 ; KNL_64-NEXT: retq 477 ; 478 ; KNL_32-LABEL: test10: 479 ; KNL_32: # BB#0: # %entry 480 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2 481 ; KNL_32-NEXT: vpbroadcastd .LCPI9_0, %ymm3 482 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1 483 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0 484 ; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3 485 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0 486 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0 487 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 488 ; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1 489 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 490 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1 491 ; KNL_32-NEXT: kxnorw %k1, %k1, %k1 492 ; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} 493 ; KNL_32-NEXT: retl 494 ; 495 ; SKX-LABEL: test10: 496 ; SKX: # BB#0: # %entry 497 ; SKX-NEXT: vpbroadcastq %rdi, %zmm2 498 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0 499 ; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0 500 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1 501 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1 502 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 503 ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 504 ; SKX-NEXT: kxnorw %k1, %k1, %k1 505 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} 506 ; SKX-NEXT: retq 507 entry: 508 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0 509 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer 510 511 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13 512 %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) 513 ret <8 x i32> 
%res 514 } 515 516 ; Splat index in GEP, requires broadcast 517 define <16 x float> @test11(float* %base, i32 %ind) { 518 ; KNL_64-LABEL: test11: 519 ; KNL_64: # BB#0: 520 ; KNL_64-NEXT: vpbroadcastd %esi, %zmm1 521 ; KNL_64-NEXT: kxnorw %k1, %k1, %k1 522 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} 523 ; KNL_64-NEXT: retq 524 ; 525 ; KNL_32-LABEL: test11: 526 ; KNL_32: # BB#0: 527 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 528 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1 529 ; KNL_32-NEXT: kxnorw %k1, %k1, %k1 530 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 531 ; KNL_32-NEXT: retl 532 ; 533 ; SKX-LABEL: test11: 534 ; SKX: # BB#0: 535 ; SKX-NEXT: vpbroadcastd %esi, %zmm1 536 ; SKX-NEXT: kxnorw %k1, %k1, %k1 537 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} 538 ; SKX-NEXT: retq 539 540 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 541 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer 542 543 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind 544 545 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 546 ret <16 x float>%res 547 } 548 549 ; We are checking the uniform base here. 
It is taken directly from input to vgatherdps 550 define <16 x float> @test12(float* %base, <16 x i32> %ind) { 551 ; KNL_64-LABEL: test12: 552 ; KNL_64: # BB#0: 553 ; KNL_64-NEXT: kxnorw %k1, %k1, %k1 554 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 555 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0 556 ; KNL_64-NEXT: retq 557 ; 558 ; KNL_32-LABEL: test12: 559 ; KNL_32: # BB#0: 560 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 561 ; KNL_32-NEXT: kxnorw %k1, %k1, %k1 562 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 563 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 564 ; KNL_32-NEXT: retl 565 ; 566 ; SKX-LABEL: test12: 567 ; SKX: # BB#0: 568 ; SKX-NEXT: kxnorw %k1, %k1, %k1 569 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 570 ; SKX-NEXT: vmovaps %zmm1, %zmm0 571 ; SKX-NEXT: retq 572 573 %sext_ind = sext <16 x i32> %ind to <16 x i64> 574 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind 575 576 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 577 ret <16 x float>%res 578 } 579 580 ; The same as the previous, but the mask is undefined 581 define <16 x float> @test13(float* %base, <16 x i32> %ind) { 582 ; KNL_64-LABEL: test13: 583 ; KNL_64: # BB#0: 584 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 585 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0 586 ; KNL_64-NEXT: retq 587 ; 588 ; KNL_32-LABEL: test13: 589 ; KNL_32: # BB#0: 590 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 591 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 592 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 593 ; KNL_32-NEXT: retl 594 ; 595 ; SKX-LABEL: test13: 596 ; SKX: # BB#0: 597 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 598 ; SKX-NEXT: vmovaps %zmm1, %zmm0 599 ; SKX-NEXT: retq 600 601 %sext_ind = sext <16 x i32> %ind to <16 x i64> 602 %gep.random = getelementptr float, 
float *%base, <16 x i64> %sext_ind 603 604 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef) 605 ret <16 x float>%res 606 } 607 608 ; The base pointer is not splat, can't find uniform base 609 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) { 610 ; KNL_64-LABEL: test14: 611 ; KNL_64: # BB#0: 612 ; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 613 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 614 ; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0 615 ; KNL_64-NEXT: vmovd %esi, %xmm1 616 ; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1 617 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 618 ; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1 619 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 620 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1 621 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1} 622 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1} 623 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0 624 ; KNL_64-NEXT: retq 625 ; 626 ; KNL_32-LABEL: test14: 627 ; KNL_32: # BB#0: 628 ; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1 629 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 630 ; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0 631 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1 632 ; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1 633 ; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1} 634 ; KNL_32-NEXT: retl 635 ; 636 ; SKX-LABEL: test14: 637 ; SKX: # BB#0: 638 ; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 639 ; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0 640 ; SKX-NEXT: vpbroadcastq %xmm0, %zmm0 641 ; SKX-NEXT: vmovd %esi, %xmm1 642 ; SKX-NEXT: vpbroadcastd %xmm1, %ymm1 643 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1 644 ; SKX-NEXT: vpsllq $2, %zmm1, %zmm1 645 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 646 ; SKX-NEXT: kshiftrw $8, %k0, %k1 647 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1} 648 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1} 649 ; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0 650 ; SKX-NEXT: retq 651 ; 652 ; 
SKX_32-LABEL: test14: 653 ; SKX_32: # BB#0: 654 ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1 655 ; SKX_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 656 ; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0 657 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1 658 ; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1 659 ; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1} 660 ; SKX_32-NEXT: retl 661 662 %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1 663 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer 664 665 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind 666 667 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef) 668 ret <16 x float>%res 669 } 670 671 declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>) 672 declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>) 673 declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>) 674 675 ; Gather smaller than existing instruction 676 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) { 677 ; 678 ; KNL_64-LABEL: test15: 679 ; KNL_64: # BB#0: 680 ; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2 681 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 682 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2 683 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0 684 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 685 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1 686 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1} 687 ; KNL_64-NEXT: retq 688 ; 689 ; KNL_32-LABEL: test15: 690 ; KNL_32: # BB#0: 691 ; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2 692 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 693 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 694 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2 695 ; KNL_32-NEXT: vpmovsxdq 
%ymm1, %zmm0
; KNL_32-NEXT: vpandq .LCPI14_0, %zmm0, %zmm0
; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test15:
; SKX: # BB#0:
; SKX-NEXT: vpmovd2m %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq

  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
  ret <4 x float>%res
}

; NOTE(review): The CHECK lines in this file are autogenerated by
; utils/update_llc_test_checks.py (see file header); do not hand-edit them —
; rerun the script after any codegen change.
; Gather smaller than existing instruction
define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
;
; KNL_64-LABEL: test16:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test16:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: vpandq .LCPI15_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test16:
; SKX: # BB#0:
; SKX-NEXT: vpmovd2m %xmm1, %k1
; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq

  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
  %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
  ret <4 x double>%res
}

; <2 x double> gather with a variable <2 x i1> mask and non-undef pass-through.
define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
;
; KNL_64-LABEL: test17:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test17:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpandq .LCPI16_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test17:
; SKX: # BB#0:
; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq

  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
  %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
  ret <2 x double>%res
}

declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )

; <4 x i32> scatter through a full vector of pointers (no common base).
define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
;
; KNL_64-LABEL: test18:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test18:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
; KNL_32-NEXT: vpandq .LCPI17_0, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test18:
; SKX: # BB#0:
; SKX-NEXT: vpmovd2m %xmm2, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
  call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
  ret void
}

; <4 x double> scatter via GEP from a scalar base (base+index addressing).
define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
;
; KNL_64-LABEL: test19:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test19:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpandq .LCPI18_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test19:
; SKX: # BB#0:
; SKX-NEXT: vpmovd2m %xmm1, %k1
; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test19:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpmovd2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
; SKX_32-NEXT: retl
  %gep = getelementptr double, double* %ptr, <4 x i64> %ind
  call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
  ret void
}

; Data type requires widening
define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
;
; KNL_64-LABEL: test20:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL_64-NEXT: vmovq %xmm2, %xmm2
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test20:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL_32-NEXT: vmovq %xmm2, %xmm2
; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
; KNL_32-NEXT: vpandq .LCPI19_0, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test20:
; SKX: # BB#0:
; SKX-NEXT: vpmovq2m %xmm2, %k0
; SKX-NEXT: kshiftlw $2, %k0, %k0
; SKX-NEXT: kshiftrw $2, %k0, %k1
; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
  call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
  ret void
}

; Data type requires promotion
define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
;
; KNL_64-LABEL: test21:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test21:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: vpandq .LCPI20_0, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test21:
; SKX: # BB#0:
; SKX-NEXT: vpmovq2m %xmm2, %k0
; SKX-NEXT: kshiftlw $2, %k0, %k0
; SKX-NEXT: kshiftrw $2, %k0, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
  ret void
}

; The result type requires widening
declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)

; NOTE(review): CHECK lines below are autogenerated (update_llc_test_checks.py);
; regenerate rather than hand-editing them.
define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
;
;
; KNL_64-LABEL: test22:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_64-NEXT: vmovq %xmm1, %xmm1
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test22:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT: vmovq %xmm1, %xmm1
; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT: vpandq .LCPI21_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test22:
; SKX: # BB#0:
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpmovq2m %xmm1, %k0
; SKX-NEXT: kshiftlw $2, %k0, %k0
; SKX-NEXT: kshiftrw $2, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
  %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
  ret <2 x float>%res
}

declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)

; <2 x i32> gather with a variable mask and non-undef pass-through.
define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
;
; KNL_64-LABEL: test23:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test23:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpandq .LCPI22_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test23:
; SKX: # BB#0:
; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
  ret <2 x i32>%res
}

; Same as test23 but with a constant all-ones mask and undef pass-through.
define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
;
;
; KNL_64-LABEL: test24:
; KNL_64: # BB#0:
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: movzbl %al, %eax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test24:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1
; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
; KNL_32-NEXT: vpandq .LCPI23_1, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test24:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k1, %k1, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
  ret <2 x i32>%res
}

; <2 x i64> gather with a variable mask.
define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
;
; KNL_64-LABEL: test25:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test25:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpandq .LCPI24_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test25:
; SKX: # BB#0:
; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
  %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
  ret <2 x i64>%res
}

; Same as test25 but with a constant all-ones mask.
define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
;
; KNL_64-LABEL: test26:
; KNL_64: # BB#0:
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: movzbl %al, %eax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test26:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
; KNL_32-NEXT: vpandq .LCPI25_1, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test26:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k1, %k1, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
  %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
  ret <2 x i64>%res
}

; Result type requires widening; all-ones mask
define <2 x float> @test27(float* %base, <2 x i32> %ind) {
;
; KNL_64-LABEL: test27:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: movzbl %al, %eax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test27:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: movb $3, %cl
; KNL_32-NEXT: movzbl %cl, %ecx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test27:
; SKX: # BB#0:
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SKX-NEXT: movb $3, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
; SKX-NEXT: retq
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
  %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
  ret <2 x float>%res
}

; Data type requires promotion, mask is all-ones
define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
;
;
; KNL_64-LABEL: test28:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: movzbl %al, %eax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test28:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
; KNL_32-NEXT: vpandq .LCPI27_1, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test28:
; SKX: # BB#0:
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: movb $3, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
  ret void
}


; SCALAR-LABEL: test29
; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float

define <16 x float> @test29(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test29:
; KNL_64: # BB#0:
; KNL_64-NEXT: movw $44, %ax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test29:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movw $44, %cx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test29:
; SKX: # BB#0:
; SKX-NEXT: movw $44, %ax
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
  ret <16 x float>%res
}

; Check non-power-of-2 case. It should be scalarized.
declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
; KNL_64-LABEL: test30:
; KNL_64: # BB#0:
; KNL_64-NEXT: andl $1, %edx
; KNL_64-NEXT: kmovw %edx, %k1
; KNL_64-NEXT: andl $1, %esi
; KNL_64-NEXT: kmovw %esi, %k2
; KNL_64-NEXT: movl %edi, %eax
; KNL_64-NEXT: andl $1, %eax
; KNL_64-NEXT: kmovw %eax, %k0
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1
; KNL_64-NEXT: # implicit-def: %XMM0
; KNL_64-NEXT: testb $1, %dil
; KNL_64-NEXT: je .LBB29_2
; KNL_64-NEXT: # BB#1: # %cond.load
; KNL_64-NEXT: vmovq %xmm1, %rax
; KNL_64-NEXT: vmovd (%rax), %xmm0
; KNL_64-NEXT: .LBB29_2: # %else
; KNL_64-NEXT: kmovw %k2, %eax
; KNL_64-NEXT: movl %eax, %ecx
; KNL_64-NEXT: andl $1, %ecx
; KNL_64-NEXT: testb %cl, %cl
; KNL_64-NEXT: je .LBB29_4
; KNL_64-NEXT: # BB#3: # %cond.load1
; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
; KNL_64-NEXT: .LBB29_4: # %else2
; KNL_64-NEXT: kmovw %k1, %ecx
; KNL_64-NEXT: movl %ecx, %edx
; KNL_64-NEXT: andl $1, %edx
; KNL_64-NEXT: testb %dl, %dl
; KNL_64-NEXT: je .LBB29_6
; KNL_64-NEXT: # BB#5: # %cond.load4
; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL_64-NEXT: vmovq %xmm1, %rdx
; KNL_64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0
; KNL_64-NEXT: .LBB29_6: # %else5
; KNL_64-NEXT: kmovw %k0, %edx
; KNL_64-NEXT: vmovd %edx, %xmm1
; KNL_64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
; KNL_64-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test30:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: andl $1, %eax
; KNL_32-NEXT: kmovw %eax, %k1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: andl $1, %eax
; KNL_32-NEXT: kmovw %eax, %k2
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movl %eax, %ecx
; KNL_32-NEXT: andl $1, %ecx
; KNL_32-NEXT: kmovw %ecx, %k0
; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; KNL_32-NEXT: # implicit-def: %XMM0
; KNL_32-NEXT: testb $1, %al
; KNL_32-NEXT: je .LBB29_2
; KNL_32-NEXT: # BB#1: # %cond.load
; KNL_32-NEXT: vmovd %xmm1, %eax
; KNL_32-NEXT: vmovd (%eax), %xmm0
; KNL_32-NEXT: .LBB29_2: # %else
; KNL_32-NEXT: kmovw %k2, %eax
; KNL_32-NEXT: movl %eax, %ecx
; KNL_32-NEXT: andl $1, %ecx
; KNL_32-NEXT: testb %cl, %cl
; KNL_32-NEXT: je .LBB29_4
; KNL_32-NEXT: # BB#3: # %cond.load1
; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
; KNL_32-NEXT: .LBB29_4: # %else2
; KNL_32-NEXT: kmovw %k1, %ecx
; KNL_32-NEXT: movl %ecx, %edx
; KNL_32-NEXT: andl $1, %edx
; KNL_32-NEXT: testb %dl, %dl
; KNL_32-NEXT: je .LBB29_6
; KNL_32-NEXT: # BB#5: # %cond.load4
; KNL_32-NEXT: vpextrd $2, %xmm1, %edx
; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm0
; KNL_32-NEXT: .LBB29_6: # %else5
; KNL_32-NEXT: kmovw %k0, %edx
; KNL_32-NEXT: vmovd %edx, %xmm1
; KNL_32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
; KNL_32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test30:
; SKX: # BB#0:
; SKX-NEXT: vpmovd2m %xmm2, %k1
; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT: # implicit-def: %XMM0
; SKX-NEXT: andb $1, %al
; SKX-NEXT: je .LBB29_2
; SKX-NEXT: # BB#1: # %cond.load
; SKX-NEXT: vmovq %xmm1, %rax
; SKX-NEXT: vmovd (%rax), %xmm0
; SKX-NEXT: .LBB29_2: # %else
; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT: andb $1, %al
; SKX-NEXT: je .LBB29_4
; SKX-NEXT: # BB#3: # %cond.load1
; SKX-NEXT: vpextrq $1, %xmm1, %rax
; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
; SKX-NEXT: .LBB29_4: # %else2
; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT: andb $1, %al
; SKX-NEXT: je .LBB29_6
; SKX-NEXT: # BB#5: # %cond.load4
; SKX-NEXT: vextracti128 $1, %ymm1, %xmm1
; SKX-NEXT: vmovq %xmm1, %rax
; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
; SKX-NEXT: .LBB29_6: # %else5
; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1}
; SKX-NEXT: vmovaps %zmm3, %zmm0
; SKX-NEXT: retq

  %sext_ind = sext <3 x i32> %ind to <3 x i64>
  %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
  %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
  ret <3 x i32>%res
}

declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)

; KNL-LABEL: test31
; KNL: vpgatherqq
; KNL: vpgatherqq
define <16 x float*> @test31(<16 x float**> %ptrs) {
; KNL_64-LABEL: test31:
; KNL_64: # BB#0:
; KNL_64-NEXT: kxnorw %k1, %k1, %k1
; KNL_64-NEXT: kxnorw %k2, %k2, %k2
; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
; KNL_64-NEXT: kshiftrw $8, %k1, %k1
; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_64-NEXT: vmovaps %zmm3, %zmm1
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test31:
; KNL_32: # BB#0:
; KNL_32-NEXT: kxnorw %k1, %k1, %k1
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test31:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k1, %k1, %k1
; SKX-NEXT: kxnorw %k2, %k2, %k2
; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: vmovaps %zmm3, %zmm1
; SKX-NEXT: retq
;
; SKX_32-LABEL: test31:
; SKX_32: # BB#0:
; SKX_32-NEXT: kxnorw %k1, %k1, %k1
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl

  %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
  ret <16 x float*>%res
}

define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
; KNL_64-LABEL: test_gather_16i32:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16i32:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16i32:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_gather_16i32:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT: vmovaps %zmm2, %zmm0
; SKX_32-NEXT: retl
  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
  ret <16 x i32> %res
}
define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; KNL_64-LABEL: test_gather_16i64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT: vmovaps %zmm3, %zmm0
; KNL_64-NEXT: vmovaps %zmm4, %zmm1
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16i64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
; KNL_32-NEXT: .Ltmp0:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
; KNL_32-NEXT: .Ltmp1:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
; KNL_32-NEXT: .Ltmp2:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpandd .LCPI32_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16i64:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
; SKX-NEXT: vmovaps %zmm3, %zmm0
; SKX-NEXT: vmovaps %zmm4, %zmm1
; SKX-NEXT: retq
  %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
  ret <16 x i64> %res
}
declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
; KNL_64-LABEL: test_gather_16f32:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16f32:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpandd .LCPI33_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16f32:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: retq
  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
  ret <16 x float> %res
}
define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; KNL_64-LABEL: test_gather_16f64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT: vmovaps %zmm3, %zmm0
; KNL_64-NEXT: vmovaps %zmm4, %zmm1
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16f64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
; KNL_32-NEXT: .Ltmp3:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
; KNL_32-NEXT: .Ltmp4:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
; KNL_32-NEXT: .Ltmp5:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpandd .LCPI34_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16f64:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
; SKX-NEXT: vmovaps %zmm3, %zmm0
; SKX-NEXT: vmovaps %zmm4, %zmm1
; SKX-NEXT: retq
  %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
  ret <16 x double> %res
}
declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
; KNL_64-LABEL: test_scatter_16i32:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i32:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16i32:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16i32:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT: retl
  call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; KNL_64-LABEL: test_scatter_16i64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
; KNL_32-NEXT: .Ltmp6:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
; KNL_32-NEXT: .Ltmp7:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
; KNL_32-NEXT: .Ltmp8:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpandd .LCPI36_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1700 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2 1701 ; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1} 1702 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1703 ; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2} 1704 ; KNL_32-NEXT: movl %ebp, %esp 1705 ; KNL_32-NEXT: popl %ebp 1706 ; KNL_32-NEXT: retl 1707 ; 1708 ; SKX-LABEL: test_scatter_16i64: 1709 ; SKX: # BB#0: 1710 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 1711 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 1712 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 1713 ; SKX-NEXT: kshiftrw $8, %k1, %k2 1714 ; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1} 1715 ; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2} 1716 ; SKX-NEXT: retq 1717 call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask) 1718 ret void 1719 } 1720 declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask) 1721 define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) { 1722 ; KNL_64-LABEL: test_scatter_16f32: 1723 ; KNL_64: # BB#0: 1724 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 1725 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 1726 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 1727 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 1728 ; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1} 1729 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0 1730 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2} 1731 ; KNL_64-NEXT: retq 1732 ; 1733 ; KNL_32-LABEL: test_scatter_16f32: 1734 ; KNL_32: # BB#0: 1735 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 1736 ; KNL_32-NEXT: vpandd .LCPI37_0{1to16}, %zmm1, %zmm1 1737 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1738 ; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1} 1739 ; KNL_32-NEXT: retl 1740 ; 1741 ; SKX-LABEL: test_scatter_16f32: 1742 ; SKX: # BB#0: 1743 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 1744 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 1745 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 1746 ; SKX-NEXT: kshiftrw $8, %k1, %k2 1747 ; SKX-NEXT: 
vscatterqps %ymm3, (,%zmm0) {%k1} 1748 ; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0 1749 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2} 1750 ; SKX-NEXT: retq 1751 call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask) 1752 ret void 1753 } 1754 declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask) 1755 define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) { 1756 ; KNL_64-LABEL: test_scatter_16f64: 1757 ; KNL_64: # BB#0: 1758 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 1759 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 1760 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 1761 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 1762 ; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1} 1763 ; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2} 1764 ; KNL_64-NEXT: retq 1765 ; 1766 ; KNL_32-LABEL: test_scatter_16f64: 1767 ; KNL_32: # BB#0: 1768 ; KNL_32-NEXT: pushl %ebp 1769 ; KNL_32-NEXT: .Ltmp9: 1770 ; KNL_32-NEXT: .cfi_def_cfa_offset 8 1771 ; KNL_32-NEXT: .Ltmp10: 1772 ; KNL_32-NEXT: .cfi_offset %ebp, -8 1773 ; KNL_32-NEXT: movl %esp, %ebp 1774 ; KNL_32-NEXT: .Ltmp11: 1775 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp 1776 ; KNL_32-NEXT: andl $-64, %esp 1777 ; KNL_32-NEXT: subl $64, %esp 1778 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 1779 ; KNL_32-NEXT: vpandd .LCPI38_0{1to16}, %zmm1, %zmm1 1780 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1781 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1 1782 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2 1783 ; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1} 1784 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1785 ; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2} 1786 ; KNL_32-NEXT: movl %ebp, %esp 1787 ; KNL_32-NEXT: popl %ebp 1788 ; KNL_32-NEXT: retl 1789 ; 1790 ; SKX-LABEL: test_scatter_16f64: 1791 ; SKX: # BB#0: 1792 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 1793 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 1794 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 
1795 ; SKX-NEXT: kshiftrw $8, %k1, %k2 1796 ; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1} 1797 ; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2} 1798 ; SKX-NEXT: retq 1799 call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask) 1800 ret void 1801 } 1802 declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask) 1803