; Codegen tests for AVX-512 masked gather/scatter intrinsics: KNL (avx512f)
; and SKX (avx512vl+avx512dq) llc runs in 64- and 32-bit mode, plus a
; CodeGenPrepare run (SCALAR) on a target without gather support and an -O0
; fast-isel smoke test.
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"


; SCALAR-LABEL: test1
; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float

define <16 x float> @test1(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test1:
; KNL_64: # BB#0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test1:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test1:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )


; SCALAR-LABEL: test2
; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT: %res.phi.else = phi
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2

define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test2:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test2:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test2:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
  %imask = bitcast i16 %mask to <16 x i1>
  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
  ret <16 x float> %res
}

define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test3:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test3:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test3:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
  %imask = bitcast i16 %mask to <16 x i1>
  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  ret <16 x i32> %res
}


define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test4:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; KNL_64-NEXT: vmovaps %zmm1, %zmm2
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test4:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
; KNL_32-NEXT: vmovaps %zmm1, %zmm2
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test4:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; SKX-NEXT: vmovaps %zmm1, %zmm2
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
  %imask = bitcast i16 %mask to <16 x i1>
  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
  %res = add <16 x i32> %gt1, %gt2
  ret <16 x i32> %res
}


; SCALAR-LABEL: test5
; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0
; SCALAR-NEXT: %ToStore0 = icmp eq i1 %Mask0, true
; SCALAR-NEXT: br i1 %ToStore0, label %cond.store, label %else
; SCALAR: cond.store:
; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i32 0
; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT: %ToStore1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2

define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; KNL_64-LABEL: test5:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test5:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test5:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
  %imask = bitcast i16 %mask to <16 x i1>
  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
  ret void
}

declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )


; SCALAR-LABEL: test6
; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i32 1
; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4
; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i32 2
; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4

define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; KNL_64-LABEL: test6:
; KNL_64: # BB#0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: kxnorw %k0, %k0, %k2
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test6:
; KNL_32: # BB#0:
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2
; KNL_32-NEXT: kxnorw %k0, %k0, %k2
; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test6:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: kxnorw %k0, %k0, %k2
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq

  %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)

  call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <8 x i32>%a
}

define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
;
; KNL_64-LABEL: test7:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
; KNL_64-NEXT: vmovaps %zmm1, %zmm2
; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test7:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
; KNL_32-NEXT: vmovaps %zmm1, %zmm2
; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test7:
; SKX: # BB#0:
; SKX-NEXT: kmovb %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
; SKX-NEXT: vmovaps %zmm1, %zmm2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer

  %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
  %imask = bitcast i8 %mask to <8 x i1>
  %gt1 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
  %gt2 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
  %res = add <8 x i32> %gt1, %gt2
  ret <8 x i32> %res
}

; No uniform base in this case, index <8 x i64> contains addresses,
; each gather call will be split into two
define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test8:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %edi, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: kmovw %k2, %k3
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
; KNL_64-NEXT: kmovw %k1, %k3
; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test8:
; KNL_32: # BB#0:
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
; KNL_32-NEXT: vmovaps %zmm1, %zmm2
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test8:
; SKX: # BB#0:
; SKX-NEXT: kmovw %edi, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: kmovw %k2, %k3
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
; SKX-NEXT: kmovw %k1, %k3
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test8:
; SKX_32: # BB#0:
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
; SKX_32-NEXT: vmovaps %zmm1, %zmm2
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; SKX_32-NEXT: retl

  %imask = bitcast i16 %mask to <16 x i1>
  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
  %res = add <16 x i32> %gt1, %gt2
  ret <16 x i32> %res
}

%struct.RT = type { i8, [10 x [20 x i32]], i8 }
%struct.ST = type { i32, double, %struct.RT }

; Masked gather for aggregate types
; Test9 and Test10 should give the same result (scalar and vector indices in GEP)


define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_64-LABEL: test9:
; KNL_64: # BB#0: # %entry
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test9:
; KNL_32: # BB#0: # %entry
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
; KNL_32-NEXT: vpbroadcastd .LCPI8_0, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test9:
; SKX: # BB#0: # %entry
; SKX-NEXT: vpbroadcastq %rdi, %zmm2
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX-NEXT: retq
entry:
  %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer

  %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
  %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %res
}

define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_64-LABEL: test10:
; KNL_64: # BB#0: # %entry
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test10:
; KNL_32: # BB#0: # %entry
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
; KNL_32-NEXT: vpbroadcastd .LCPI9_0, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test10:
; SKX: # BB#0: # %entry
; SKX-NEXT: vpbroadcastq %rdi, %zmm2
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX-NEXT: retq
entry:
  %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer

  %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
  %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %res
}

; Splat index in GEP, requires broadcast 516 define <16 x float> @test11(float* %base, i32 %ind) { 517 ; KNL_64-LABEL: test11: 518 ; KNL_64: # BB#0: 519 ; KNL_64-NEXT: vpbroadcastd %esi, %zmm1 520 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1 521 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} 522 ; KNL_64-NEXT: retq 523 ; 524 ; KNL_32-LABEL: test11: 525 ; KNL_32: # BB#0: 526 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 527 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1 528 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1 529 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 530 ; KNL_32-NEXT: retl 531 ; 532 ; SKX-LABEL: test11: 533 ; SKX: # BB#0: 534 ; SKX-NEXT: vpbroadcastd %esi, %zmm1 535 ; SKX-NEXT: kxnorw %k0, %k0, %k1 536 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} 537 ; SKX-NEXT: retq 538 539 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 540 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer 541 542 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind 543 544 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 545 ret <16 x float>%res 546 } 547 548 ; We are checking the uniform base here. 
It is taken directly from input to vgatherdps 549 define <16 x float> @test12(float* %base, <16 x i32> %ind) { 550 ; KNL_64-LABEL: test12: 551 ; KNL_64: # BB#0: 552 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1 553 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 554 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0 555 ; KNL_64-NEXT: retq 556 ; 557 ; KNL_32-LABEL: test12: 558 ; KNL_32: # BB#0: 559 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 560 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1 561 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 562 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 563 ; KNL_32-NEXT: retl 564 ; 565 ; SKX-LABEL: test12: 566 ; SKX: # BB#0: 567 ; SKX-NEXT: kxnorw %k0, %k0, %k1 568 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 569 ; SKX-NEXT: vmovaps %zmm1, %zmm0 570 ; SKX-NEXT: retq 571 572 %sext_ind = sext <16 x i32> %ind to <16 x i64> 573 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind 574 575 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 576 ret <16 x float>%res 577 } 578 579 ; The same as the previous, but the mask is undefined 580 define <16 x float> @test13(float* %base, <16 x i32> %ind) { 581 ; KNL_64-LABEL: test13: 582 ; KNL_64: # BB#0: 583 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 584 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0 585 ; KNL_64-NEXT: retq 586 ; 587 ; KNL_32-LABEL: test13: 588 ; KNL_32: # BB#0: 589 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 590 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 591 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 592 ; KNL_32-NEXT: retl 593 ; 594 ; SKX-LABEL: test13: 595 ; SKX: # BB#0: 596 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 597 ; SKX-NEXT: vmovaps %zmm1, %zmm0 598 ; SKX-NEXT: retq 599 600 %sext_ind = sext <16 x i32> %ind to <16 x i64> 601 %gep.random = getelementptr float, 
float *%base, <16 x i64> %sext_ind 602 603 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef) 604 ret <16 x float>%res 605 } 606 607 ; The base pointer is not splat, can't find unform base 608 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) { 609 ; KNL_64-LABEL: test14: 610 ; KNL_64: # BB#0: 611 ; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 612 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 613 ; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0 614 ; KNL_64-NEXT: vmovd %esi, %xmm1 615 ; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1 616 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 617 ; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1 618 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 619 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1 620 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1} 621 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1} 622 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0 623 ; KNL_64-NEXT: retq 624 ; 625 ; KNL_32-LABEL: test14: 626 ; KNL_32: # BB#0: 627 ; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1 628 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 629 ; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0 630 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1 631 ; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1 632 ; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1} 633 ; KNL_32-NEXT: retl 634 ; 635 ; SKX-LABEL: test14: 636 ; SKX: # BB#0: 637 ; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 638 ; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0 639 ; SKX-NEXT: vpbroadcastq %xmm0, %zmm0 640 ; SKX-NEXT: vpbroadcastd %esi, %ymm1 641 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1 642 ; SKX-NEXT: vpsllq $2, %zmm1, %zmm1 643 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 644 ; SKX-NEXT: kshiftrw $8, %k0, %k1 645 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1} 646 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1} 647 ; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0 648 ; SKX-NEXT: retq 649 ; 650 ; SKX_32-LABEL: test14: 651 ; SKX_32: # 
BB#0: 652 ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1 653 ; SKX_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 654 ; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0 655 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1 656 ; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1 657 ; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1} 658 ; SKX_32-NEXT: retl 659 660 %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1 661 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer 662 663 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind 664 665 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef) 666 ret <16 x float>%res 667 } 668 669 declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>) 670 declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>) 671 declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>) 672 673 ; Gather smaller than existing instruction 674 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) { 675 ; 676 ; KNL_64-LABEL: test15: 677 ; KNL_64: # BB#0: 678 ; KNL_64: vpxor %ymm2, %ymm2, %ymm2 679 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 680 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2 681 ; KNL_64-NEXT: vpslld $31, %ymm1, %ymm0 682 ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1 683 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1} 684 ; KNL_64-NEXT: # kill 685 ; KNL_64-NEXT: retq 686 ; 687 ; KNL_32-LABEL: test15: 688 ; KNL_32: # BB#0: 689 ; KNL_32: vpxor %ymm2, %ymm2, %ymm2 690 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 691 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 692 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2 693 ; KNL_32-NEXT: vpslld $31, %ymm1, %ymm0 694 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1 695 ; KNL_32-NEXT: 
vgatherqps (%eax,%zmm2,4), %ymm0 {%k1} 696 ; KNL_32-NEXT: # kill 697 ; KNL_32-NEXT: retl 698 ; 699 ; SKX-LABEL: test15: 700 ; SKX: # BB#0: 701 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1 702 ; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 703 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1} 704 ; SKX-NEXT: vmovaps %zmm1, %zmm0 705 ; SKX-NEXT: retq 706 ; 707 ; SKX_32-LABEL: test15: 708 ; SKX_32: # BB#0: 709 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 710 ; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1 711 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 712 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1} 713 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0 714 ; SKX_32-NEXT: retl 715 716 %sext_ind = sext <4 x i32> %ind to <4 x i64> 717 %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind 718 %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef) 719 ret <4 x float>%res 720 } 721 722 ; Gather smaller than existing instruction 723 define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) { 724 ; 725 ; KNL_64-LABEL: test16: 726 ; KNL_64: # BB#0: 727 ; KNL_64: vpslld $31, %xmm1, %xmm1 728 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1 729 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 730 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 731 ; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 732 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 733 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 734 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 735 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} 736 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0 737 ; KNL_64-NEXT: retq 738 ; 739 ; KNL_32-LABEL: test16: 740 ; KNL_32: # BB#0: 741 ; KNL_32: vpslld $31, %xmm1, %xmm1 742 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 743 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 744 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 745 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 746 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 747 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 748 ; 
KNL_32-NEXT: vpsllvq .LCPI15_0, %zmm1, %zmm1 749 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 750 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} 751 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 752 ; KNL_32-NEXT: retl 753 ; 754 ; SKX-LABEL: test16: 755 ; SKX: # BB#0: 756 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1 757 ; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 758 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1} 759 ; SKX-NEXT: vmovaps %zmm2, %zmm0 760 ; SKX-NEXT: retq 761 ; 762 ; SKX_32-LABEL: test16: 763 ; SKX_32: # BB#0: 764 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 765 ; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1 766 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 767 ; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1} 768 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0 769 ; SKX_32-NEXT: retl 770 771 %sext_ind = sext <4 x i32> %ind to <4 x i64> 772 %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind 773 %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0) 774 ret <4 x double>%res 775 } 776 777 define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) { 778 ; 779 ; KNL_64-LABEL: test17: 780 ; KNL_64: # BB#0: 781 ; KNL_64: vpxord %zmm3, %zmm3, %zmm3 782 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 783 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 784 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 785 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} 786 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0 787 ; KNL_64-NEXT: retq 788 ; 789 ; KNL_32-LABEL: test17: 790 ; KNL_32: # BB#0: 791 ; KNL_32: vpxord %zmm3, %zmm3, %zmm3 792 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 793 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 794 ; KNL_32-NEXT: vpsllvq .LCPI16_0, %zmm1, %zmm1 795 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 796 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} 797 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 798 ; KNL_32-NEXT: retl 799 ; 800 ; SKX-LABEL: test17: 801 ; SKX: # BB#0: 
802 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 803 ; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 804 ; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1} 805 ; SKX-NEXT: vmovaps %zmm2, %zmm0 806 ; SKX-NEXT: retq 807 ; 808 ; SKX_32-LABEL: test17: 809 ; SKX_32: # BB#0: 810 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 811 ; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 812 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 813 ; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1} 814 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0 815 ; SKX_32-NEXT: retl 816 817 %sext_ind = sext <2 x i32> %ind to <2 x i64> 818 %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind 819 %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0) 820 ret <2 x double>%res 821 } 822 823 declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> ) 824 declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> ) 825 declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> ) 826 declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> ) 827 declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> ) 828 829 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { 830 ; 831 ; KNL_64-LABEL: test18: 832 ; KNL_64: # BB#0: 833 ; KNL_64: vpxor %ymm3, %ymm3, %ymm3 834 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 835 ; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2 836 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 837 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 838 ; KNL_64-NEXT: retq 839 ; 840 ; KNL_32-LABEL: test18: 841 ; KNL_32: # BB#0: 842 ; KNL_32: vpxor %ymm3, %ymm3, %ymm3 843 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 844 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 845 ; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2 846 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1 847 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) 
{%k1} 848 ; KNL_32-NEXT: retl 849 ; 850 ; SKX-LABEL: test18: 851 ; SKX: # BB#0: 852 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2 853 ; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1 854 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} 855 ; SKX-NEXT: retq 856 ; 857 ; SKX_32-LABEL: test18: 858 ; SKX_32: # BB#0: 859 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2 860 ; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1 861 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} 862 ; SKX_32-NEXT: retl 863 call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) 864 ret void 865 } 866 867 define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) { 868 ; 869 ; KNL_64-LABEL: test19: 870 ; KNL_64: # BB#0: 871 ; KNL_64: vpslld $31, %xmm1, %xmm1 872 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1 873 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 874 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 875 ; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 876 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 877 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 878 ; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1} 879 ; KNL_64-NEXT: retq 880 ; 881 ; KNL_32-LABEL: test19: 882 ; KNL_32: # BB#0: 883 ; KNL_32: vpslld $31, %xmm1, %xmm1 884 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 885 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 886 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 887 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 888 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 889 ; KNL_32-NEXT: vpsllvq .LCPI18_0, %zmm1, %zmm1 890 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 891 ; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1} 892 ; KNL_32-NEXT: retl 893 ; 894 ; SKX-LABEL: test19: 895 ; SKX: # BB#0: 896 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1 897 ; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 898 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1} 899 ; SKX-NEXT: retq 900 ; 901 ; SKX_32-LABEL: test19: 902 ; SKX_32: # BB#0: 903 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 904 ; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1 905 ; 
SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 906 ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1} 907 ; SKX_32-NEXT: retl 908 %gep = getelementptr double, double* %ptr, <4 x i64> %ind 909 call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask) 910 ret void 911 } 912 913 ; Data type requires widening 914 define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { 915 ; 916 ; KNL_64-LABEL: test20: 917 ; KNL_64: # BB#0: 918 ; KNL_64: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 919 ; KNL_64-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero 920 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 921 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 922 ; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2 923 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 924 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} 925 ; KNL_64-NEXT: retq 926 ; 927 ; KNL_32-LABEL: test20: 928 ; KNL_32: # BB#0: 929 ; KNL_32: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 930 ; KNL_32-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero 931 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 932 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 933 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 934 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 935 ; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2 936 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1 937 ; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} 938 ; KNL_32-NEXT: retl 939 ; 940 ; SKX-LABEL: test20: 941 ; SKX: # BB#0: 942 ; SKX: vpsllq $63, %xmm2, %xmm2 943 ; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0 944 ; SKX-NEXT: kshiftlb $6, %k0, %k0 945 ; SKX-NEXT: kshiftrb $6, %k0, %k1 946 ; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1} 947 ; SKX-NEXT: retq 948 ; 949 ; SKX_32-LABEL: test20: 950 ; SKX_32: # BB#0: 951 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 952 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 953 ; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0 954 ; SKX_32-NEXT: kshiftlb $6, %k0, %k0 955 ; SKX_32-NEXT: kshiftrb $6, %k0, %k1 956 ; SKX_32-NEXT: 
vscatterdps %xmm0, (,%xmm1) {%k1} 957 ; SKX_32-NEXT: retl 958 call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask) 959 ret void 960 } 961 962 ; Data type requires promotion 963 define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { 964 ; 965 ; KNL_64-LABEL: test21: 966 ; KNL_64: # BB#0: 967 ; KNL_64: vpxord %zmm3, %zmm3, %zmm3 968 ; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 969 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 970 ; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2 971 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 972 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 973 ; KNL_64-NEXT: retq 974 ; 975 ; KNL_32-LABEL: test21: 976 ; KNL_32: # BB#0: 977 ; KNL_32: vpxord %zmm3, %zmm3, %zmm3 978 ; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 979 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 980 ; KNL_32-NEXT: vpsllvq .LCPI20_0, %zmm2, %zmm2 981 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 982 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 983 ; KNL_32-NEXT: retl 984 ; 985 ; SKX-LABEL: test21: 986 ; SKX: # BB#0: 987 ; SKX: vpsllq $63, %xmm2, %xmm2 988 ; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0 989 ; SKX-NEXT: kshiftlb $6, %k0, %k0 990 ; SKX-NEXT: kshiftrb $6, %k0, %k1 991 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 992 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} 993 ; SKX-NEXT: retq 994 ; 995 ; SKX_32-LABEL: test21: 996 ; SKX_32: # BB#0: 997 ; SKX_32: vpsllq $63, %xmm2, %xmm2 998 ; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0 999 ; SKX_32-NEXT: kshiftlb $6, %k0, %k0 1000 ; SKX_32-NEXT: kshiftrb $6, %k0, %k1 1001 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1002 ; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} 1003 ; SKX_32-NEXT: retl 1004 call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) 1005 ret void 1006 } 1007 1008 ; The result type requires widening 1009 declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>) 
1010 1011 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) { 1012 ; 1013 ; 1014 ; KNL_64-LABEL: test22: 1015 ; KNL_64: # BB#0: 1016 ; KNL_64: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1017 ; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero 1018 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 1019 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 1020 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1021 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 1022 ; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1 1023 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1 1024 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1} 1025 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0 1026 ; KNL_64-NEXT: retq 1027 ; 1028 ; KNL_32-LABEL: test22: 1029 ; KNL_32: # BB#0: 1030 ; KNL_32: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1031 ; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero 1032 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 1033 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 1034 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1035 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1036 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 1037 ; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1 1038 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1039 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1} 1040 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1041 ; KNL_32-NEXT: retl 1042 ; 1043 ; SKX-LABEL: test22: 1044 ; SKX: # BB#0: 1045 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1046 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 1047 ; SKX-NEXT: vptestmq %xmm1, %xmm1, %k0 1048 ; SKX-NEXT: kshiftlb $6, %k0, %k0 1049 ; SKX-NEXT: kshiftrb $6, %k0, %k1 1050 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1} 1051 ; SKX-NEXT: vmovaps %zmm2, %zmm0 1052 ; SKX-NEXT: retq 1053 ; 1054 ; SKX_32-LABEL: test22: 1055 ; SKX_32: # BB#0: 1056 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1057 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 1058 ; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k0 1059 ; SKX_32-NEXT: kshiftlb $6, 
%k0, %k0 1060 ; SKX_32-NEXT: kshiftrb $6, %k0, %k1 1061 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1062 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1} 1063 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0 1064 ; SKX_32-NEXT: retl 1065 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1066 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind 1067 %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0) 1068 ret <2 x float>%res 1069 } 1070 1071 declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) 1072 declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>) 1073 1074 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) { 1075 ; 1076 ; KNL_64-LABEL: test23: 1077 ; KNL_64: # BB#0: 1078 ; KNL_64: vpxord %zmm3, %zmm3, %zmm3 1079 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 1080 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 1081 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 1082 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} 1083 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0 1084 ; KNL_64-NEXT: retq 1085 ; 1086 ; KNL_32-LABEL: test23: 1087 ; KNL_32: # BB#0: 1088 ; KNL_32: vpxord %zmm3, %zmm3, %zmm3 1089 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 1090 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1091 ; KNL_32-NEXT: vpsllvq .LCPI22_0, %zmm1, %zmm1 1092 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 1093 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} 1094 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1095 ; KNL_32-NEXT: retl 1096 ; 1097 ; SKX-LABEL: test23: 1098 ; SKX: # BB#0: 1099 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 1100 ; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 1101 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} 1102 ; SKX-NEXT: vmovaps %zmm2, %zmm0 1103 ; SKX-NEXT: retq 1104 ; 1105 ; SKX_32-LABEL: test23: 1106 ; SKX_32: # BB#0: 1107 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 1108 ; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 1109 ; 
SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1110 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1} 1111 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0 1112 ; SKX_32-NEXT: retl 1113 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1114 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind 1115 %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) 1116 ret <2 x i32>%res 1117 } 1118 1119 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { 1120 ; KNL_64-LABEL: test24: 1121 ; KNL_64: # BB#0: 1122 ; KNL_64: movb $3, %al 1123 ; KNL_64-NEXT: kmovw %eax, %k1 1124 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} 1125 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0 1126 ; KNL_64-NEXT: retq 1127 ; 1128 ; KNL_32-LABEL: test24: 1129 ; KNL_32: # BB#0: 1130 ; KNL_32: movl {{[0-9]+}}(%esp), %eax 1131 ; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1 1132 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1 1133 ; KNL_32-NEXT: vpsllvq .LCPI23_1, %zmm1, %zmm1 1134 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 1135 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} 1136 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 1137 ; KNL_32-NEXT: retl 1138 ; 1139 ; SKX-LABEL: test24: 1140 ; SKX: # BB#0: 1141 ; SKX-NEXT: kxnorw %k0, %k0, %k1 1142 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1} 1143 ; SKX-NEXT: vmovaps %zmm1, %zmm0 1144 ; SKX-NEXT: retq 1145 ; 1146 ; SKX_32-LABEL: test24: 1147 ; SKX_32: # BB#0: 1148 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1149 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1 1150 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1} 1151 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0 1152 ; SKX_32-NEXT: retl 1153 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1154 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind 1155 %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) 1156 ret <2 x i32>%res 1157 } 1158 1159 define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 
x i1> %mask, <2 x i64> %src0) { 1160 ; 1161 ; KNL_64-LABEL: test25: 1162 ; KNL_64: # BB#0: 1163 ; KNL_64: vpxord %zmm3, %zmm3, %zmm3 1164 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 1165 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 1166 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 1167 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} 1168 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0 1169 ; KNL_64-NEXT: retq 1170 ; 1171 ; KNL_32-LABEL: test25: 1172 ; KNL_32: # BB#0: 1173 ; KNL_32: vpxord %zmm3, %zmm3, %zmm3 1174 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 1175 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1176 ; KNL_32-NEXT: vpsllvq .LCPI24_0, %zmm1, %zmm1 1177 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 1178 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} 1179 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1180 ; KNL_32-NEXT: retl 1181 ; 1182 ; SKX-LABEL: test25: 1183 ; SKX: # BB#0: 1184 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 1185 ; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 1186 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} 1187 ; SKX-NEXT: vmovaps %zmm2, %zmm0 1188 ; SKX-NEXT: retq 1189 ; 1190 ; SKX_32-LABEL: test25: 1191 ; SKX_32: # BB#0: 1192 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 1193 ; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 1194 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1195 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1} 1196 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0 1197 ; SKX_32-NEXT: retl 1198 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1199 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind 1200 %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0) 1201 ret <2 x i64>%res 1202 } 1203 1204 define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { 1205 ; 1206 ; KNL_64-LABEL: test26: 1207 ; KNL_64: # BB#0: 1208 ; KNL_64: movb $3, %al 1209 ; KNL_64-NEXT: kmovw %eax, %k1 1210 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} 1211 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0 1212 ; 
KNL_64-NEXT: retq 1213 ; 1214 ; KNL_32-LABEL: test26: 1215 ; KNL_32: # BB#0: 1216 ; KNL_32: movl {{[0-9]+}}(%esp), %eax 1217 ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 1218 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2 1219 ; KNL_32-NEXT: vpsllvq .LCPI25_1, %zmm2, %zmm2 1220 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 1221 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} 1222 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 1223 ; KNL_32-NEXT: retl 1224 ; 1225 ; SKX-LABEL: test26: 1226 ; SKX: # BB#0: 1227 ; SKX-NEXT: kxnorw %k0, %k0, %k1 1228 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1} 1229 ; SKX-NEXT: vmovaps %zmm1, %zmm0 1230 ; SKX-NEXT: retq 1231 ; 1232 ; SKX_32-LABEL: test26: 1233 ; SKX_32: # BB#0: 1234 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1235 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1 1236 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1} 1237 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0 1238 ; SKX_32-NEXT: retl 1239 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1240 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind 1241 %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0) 1242 ret <2 x i64>%res 1243 } 1244 1245 ; Result type requires widening; all-ones mask 1246 define <2 x float> @test27(float* %base, <2 x i32> %ind) { 1247 ; 1248 ; KNL_64-LABEL: test27: 1249 ; KNL_64: # BB#0: 1250 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1251 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1 1252 ; KNL_64-NEXT: movb $3, %al 1253 ; KNL_64-NEXT: kmovw %eax, %k1 1254 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} 1255 ; KNL_64-NEXT: # kill 1256 ; KNL_64-NEXT: retq 1257 ; 1258 ; KNL_32-LABEL: test27: 1259 ; KNL_32: # BB#0: 1260 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1261 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1262 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1 1263 ; KNL_32-NEXT: movb $3, %cl 1264 ; KNL_32-NEXT: kmovw %ecx, %k1 1265 ; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), 
%ymm0 {%k1} 1266 ; KNL_32-NEXT: # kill 1267 ; KNL_32-NEXT: retl 1268 ; 1269 ; SKX-LABEL: test27: 1270 ; SKX: # BB#0: 1271 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] 1272 ; SKX-NEXT: movb $3, %al 1273 ; SKX-NEXT: kmovb %eax, %k1 1274 ; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1} 1275 ; SKX-NEXT: retq 1276 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1277 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind 1278 %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef) 1279 ret <2 x float>%res 1280 } 1281 1282 ; Data type requires promotion, mask is all-ones 1283 define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) { 1284 ; 1285 ; 1286 ; KNL_64-LABEL: test28: 1287 ; KNL_64: # BB#0: 1288 ; KNL_64: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1289 ; KNL_64-NEXT: movb $3, %al 1290 ; KNL_64-NEXT: kmovw %eax, %k1 1291 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 1292 ; KNL_64-NEXT: retq 1293 ; 1294 ; KNL_32-LABEL: test28: 1295 ; KNL_32: # BB#0: 1296 ; KNL_32: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1297 ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 1298 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2 1299 ; KNL_32-NEXT: vpsllvq .LCPI27_1, %zmm2, %zmm2 1300 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 1301 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 1302 ; KNL_32-NEXT: retl 1303 ; 1304 ; SKX-LABEL: test28: 1305 ; SKX: # BB#0: 1306 ; SKX: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1307 ; SKX-NEXT: movb $3, %al 1308 ; SKX-NEXT: kmovb %eax, %k1 1309 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} 1310 ; SKX-NEXT: retq 1311 ; 1312 ; SKX_32-LABEL: test28: 1313 ; SKX_32: # BB#0: 1314 ; SKX_32: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1315 ; SKX_32-NEXT: movb $3, %al 1316 ; SKX_32-NEXT: kmovb %eax, %k1 1317 ; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} 1318 ; SKX_32-NEXT: retl 1319 call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>) 
1320 ret void 1321 } 1322 1323 1324 ; SCALAR-LABEL: test29 1325 ; SCALAR: extractelement <16 x float*> 1326 ; SCALAR-NEXT: load float 1327 ; SCALAR-NEXT: insertelement <16 x float> 1328 ; SCALAR-NEXT: extractelement <16 x float*> 1329 ; SCALAR-NEXT: load float 1330 1331 define <16 x float> @test29(float* %base, <16 x i32> %ind) { 1332 ; KNL_64-LABEL: test29: 1333 ; KNL_64: # BB#0: 1334 ; KNL_64-NEXT: movw $44, %ax 1335 ; KNL_64-NEXT: kmovw %eax, %k1 1336 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 1337 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0 1338 ; KNL_64-NEXT: retq 1339 ; 1340 ; KNL_32-LABEL: test29: 1341 ; KNL_32: # BB#0: 1342 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1343 ; KNL_32-NEXT: movw $44, %cx 1344 ; KNL_32-NEXT: kmovw %ecx, %k1 1345 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 1346 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 1347 ; KNL_32-NEXT: retl 1348 ; 1349 ; SKX-LABEL: test29: 1350 ; SKX: # BB#0: 1351 ; SKX-NEXT: movw $44, %ax 1352 ; SKX-NEXT: kmovw %eax, %k1 1353 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 1354 ; SKX-NEXT: vmovaps %zmm1, %zmm0 1355 ; SKX-NEXT: retq 1356 1357 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 1358 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer 1359 1360 %sext_ind = sext <16 x i32> %ind to <16 x i64> 1361 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind 1362 1363 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef) 1364 ret <16 x float>%res 1365 } 1366 1367 ; Check non-power-of-2 case. It should be scalarized. 
1368 declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>) 1369 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) { 1370 ; KNL_64-LABEL: test30: 1371 ; KNL_64: # BB#0: 1372 ; KNL_64-NEXT: andl $1, %edx 1373 ; KNL_64-NEXT: andl $1, %esi 1374 ; KNL_64-NEXT: movl %edi, %eax 1375 ; KNL_64-NEXT: andl $1, %eax 1376 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 1377 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1 1378 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1 1379 ; KNL_64-NEXT: # implicit-def: %XMM0 1380 ; KNL_64-NEXT: testb $1, %dil 1381 ; KNL_64-NEXT: je .LBB29_2 1382 ; KNL_64-NEXT: # BB#1: # %cond.load 1383 ; KNL_64-NEXT: vmovq %xmm1, %rcx 1384 ; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1385 ; KNL_64-NEXT: .LBB29_2: # %else 1386 ; KNL_64-NEXT: testb %sil, %sil 1387 ; KNL_64-NEXT: je .LBB29_4 1388 ; KNL_64-NEXT: # BB#3: # %cond.load1 1389 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx 1390 ; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0 1391 ; KNL_64-NEXT: .LBB29_4: # %else2 1392 ; KNL_64-NEXT: testb %dl, %dl 1393 ; KNL_64-NEXT: je .LBB29_6 1394 ; KNL_64-NEXT: # BB#5: # %cond.load4 1395 ; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1 1396 ; KNL_64-NEXT: vmovq %xmm1, %rcx 1397 ; KNL_64-NEXT: vpinsrd $2, (%rcx), %xmm0, %xmm0 1398 ; KNL_64-NEXT: .LBB29_6: # %else5 1399 ; KNL_64-NEXT: vmovd %eax, %xmm1 1400 ; KNL_64-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 1401 ; KNL_64-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 1402 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 1403 ; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 1404 ; KNL_64-NEXT: retq 1405 ; 1406 ; KNL_32-LABEL: test30: 1407 ; KNL_32: # BB#0: 1408 ; KNL_32-NEXT: pushl %ebx 1409 ; KNL_32-NEXT: .Ltmp0: 1410 ; KNL_32-NEXT: .cfi_def_cfa_offset 8 1411 ; KNL_32-NEXT: pushl %esi 1412 ; KNL_32-NEXT: .Ltmp1: 1413 ; KNL_32-NEXT: .cfi_def_cfa_offset 12 1414 ; KNL_32-NEXT: .Ltmp2: 1415 ; KNL_32-NEXT: .cfi_offset %esi, -12 1416 ; KNL_32-NEXT: .Ltmp3: 1417 ; KNL_32-NEXT: .cfi_offset 
%ebx, -8 1418 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1419 ; KNL_32-NEXT: andl $1, %eax 1420 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx 1421 ; KNL_32-NEXT: andl $1, %ecx 1422 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ebx 1423 ; KNL_32-NEXT: movl %ebx, %edx 1424 ; KNL_32-NEXT: andl $1, %edx 1425 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1 1426 ; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1427 ; KNL_32-NEXT: # implicit-def: %XMM0 1428 ; KNL_32-NEXT: testb $1, %bl 1429 ; KNL_32-NEXT: je .LBB29_2 1430 ; KNL_32-NEXT: # BB#1: # %cond.load 1431 ; KNL_32-NEXT: vmovd %xmm1, %esi 1432 ; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1433 ; KNL_32-NEXT: .LBB29_2: # %else 1434 ; KNL_32-NEXT: testb %cl, %cl 1435 ; KNL_32-NEXT: je .LBB29_4 1436 ; KNL_32-NEXT: # BB#3: # %cond.load1 1437 ; KNL_32-NEXT: vpextrd $1, %xmm1, %esi 1438 ; KNL_32-NEXT: vpinsrd $1, (%esi), %xmm0, %xmm0 1439 ; KNL_32-NEXT: .LBB29_4: # %else2 1440 ; KNL_32-NEXT: testb %al, %al 1441 ; KNL_32-NEXT: je .LBB29_6 1442 ; KNL_32-NEXT: # BB#5: # %cond.load4 1443 ; KNL_32-NEXT: vpextrd $2, %xmm1, %esi 1444 ; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm0, %xmm0 1445 ; KNL_32-NEXT: .LBB29_6: # %else5 1446 ; KNL_32-NEXT: vmovd %edx, %xmm1 1447 ; KNL_32-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 1448 ; KNL_32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 1449 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 1450 ; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 1451 ; KNL_32-NEXT: popl %esi 1452 ; KNL_32-NEXT: popl %ebx 1453 ; KNL_32-NEXT: retl 1454 ; 1455 ; SKX-LABEL: test30: 1456 ; SKX: # BB#0: 1457 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2 1458 ; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1 1459 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) 1460 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 1461 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 1462 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 1463 ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al 1464 ; SKX-NEXT: # implicit-def: %XMM0 1465 ; SKX-NEXT: testb %al, %al 1466 ; SKX-NEXT: je .LBB29_2 1467 ; SKX-NEXT: # BB#1: # %cond.load 1468 ; SKX-NEXT: 
vmovq %xmm1, %rax 1469 ; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1470 ; SKX-NEXT: .LBB29_2: # %else 1471 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) 1472 ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al 1473 ; SKX-NEXT: testb %al, %al 1474 ; SKX-NEXT: je .LBB29_4 1475 ; SKX-NEXT: # BB#3: # %cond.load1 1476 ; SKX-NEXT: vpextrq $1, %xmm1, %rax 1477 ; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0 1478 ; SKX-NEXT: .LBB29_4: # %else2 1479 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) 1480 ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al 1481 ; SKX-NEXT: testb %al, %al 1482 ; SKX-NEXT: je .LBB29_6 1483 ; SKX-NEXT: # BB#5: # %cond.load4 1484 ; SKX-NEXT: vextracti64x2 $1, %ymm1, %xmm1 1485 ; SKX-NEXT: vmovq %xmm1, %rax 1486 ; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0 1487 ; SKX-NEXT: .LBB29_6: # %else5 1488 ; SKX-NEXT: vpblendmd %xmm0, %xmm3, %xmm0 {%k1} 1489 ; SKX-NEXT: retq 1490 ; 1491 ; SKX_32-LABEL: test30: 1492 ; SKX_32: # BB#0: 1493 ; SKX_32-NEXT: subl $12, %esp 1494 ; SKX_32-NEXT: .Ltmp0: 1495 ; SKX_32-NEXT: .cfi_def_cfa_offset 16 1496 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2 1497 ; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1 1498 ; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp) 1499 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1 1500 ; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1501 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al 1502 ; SKX_32-NEXT: # implicit-def: %XMM0 1503 ; SKX_32-NEXT: testb %al, %al 1504 ; SKX_32-NEXT: je .LBB29_2 1505 ; SKX_32-NEXT: # BB#1: # %cond.load 1506 ; SKX_32-NEXT: vmovd %xmm1, %eax 1507 ; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1508 ; SKX_32-NEXT: .LBB29_2: # %else 1509 ; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp) 1510 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al 1511 ; SKX_32-NEXT: testb %al, %al 1512 ; SKX_32-NEXT: je .LBB29_4 1513 ; SKX_32-NEXT: # BB#3: # %cond.load1 1514 ; SKX_32-NEXT: vpextrd $1, %xmm1, %eax 1515 ; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0 1516 ; SKX_32-NEXT: .LBB29_4: # %else2 1517 ; SKX_32-NEXT: vmovdqa32 {{[0-9]+}}(%esp), %xmm2 1518 
; SKX_32-NEXT: kmovb %k1, (%esp) 1519 ; SKX_32-NEXT: movb (%esp), %al 1520 ; SKX_32-NEXT: testb %al, %al 1521 ; SKX_32-NEXT: je .LBB29_6 1522 ; SKX_32-NEXT: # BB#5: # %cond.load4 1523 ; SKX_32-NEXT: vpextrd $2, %xmm1, %eax 1524 ; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 1525 ; SKX_32-NEXT: .LBB29_6: # %else5 1526 ; SKX_32-NEXT: vpblendmd %xmm0, %xmm2, %xmm0 {%k1} 1527 ; SKX_32-NEXT: addl $12, %esp 1528 ; SKX_32-NEXT: retl 1529 1530 %sext_ind = sext <3 x i32> %ind to <3 x i64> 1531 %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind 1532 %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0) 1533 ret <3 x i32>%res 1534 } 1535 1536 declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>) 1537 1538 ; KNL-LABEL: test31 1539 ; KNL: vpgatherqq 1540 ; KNL: vpgatherqq 1541 define <16 x float*> @test31(<16 x float**> %ptrs) { 1542 ; KNL_64-LABEL: test31: 1543 ; KNL_64: # BB#0: 1544 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1 1545 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2 1546 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2} 1547 ; KNL_64-NEXT: kshiftrw $8, %k1, %k1 1548 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1} 1549 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0 1550 ; KNL_64-NEXT: vmovaps %zmm3, %zmm1 1551 ; KNL_64-NEXT: retq 1552 ; 1553 ; KNL_32-LABEL: test31: 1554 ; KNL_32: # BB#0: 1555 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1 1556 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} 1557 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 1558 ; KNL_32-NEXT: retl 1559 ; 1560 ; SKX-LABEL: test31: 1561 ; SKX: # BB#0: 1562 ; SKX-NEXT: kxnorw %k0, %k0, %k1 1563 ; SKX-NEXT: kxnorw %k0, %k0, %k2 1564 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2} 1565 ; SKX-NEXT: kshiftrw $8, %k1, %k1 1566 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1} 1567 ; SKX-NEXT: vmovaps %zmm2, %zmm0 1568 ; SKX-NEXT: vmovaps %zmm3, %zmm1 1569 ; SKX-NEXT: retq 1570 ; 1571 ; SKX_32-LABEL: test31: 1572 ; SKX_32: # BB#0: 1573 ; 
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k1}
; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
; SKX_32-NEXT:    retl

  %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
  ret <16 x float*>%res
}

; Gather 16 x i32 under a non-constant <16 x i1> mask (sign-extended and
; tested into %k1).  With 64-bit pointers the gather is split into two
; vpgatherqd halves, the upper mask obtained via kshiftrw; with 32-bit
; pointers a single vpgatherdd suffices.
define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
; KNL_64-LABEL: test_gather_16i32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16i32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16i32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    vextracti32x8 $1, %zmm3, %ymm2
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16i32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    retl
  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
  ret <16 x i32> %res
}

; Gather 16 x i64 (two zmm result registers).  On the 32-bit targets the
; second half of %src0 arrives on the stack (vmovdqa64 8(%ebp)), so a frame
; pointer is set up and the stack is realigned to 64 bytes.
define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; KNL_64-LABEL: test_gather_16i64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpgatherqq (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT:    vpgatherqq (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT:    vmovaps %zmm3, %zmm0
; KNL_64-NEXT:    vmovaps %zmm4, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16i64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp4:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp5:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp6:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16i64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpgatherqq (,%zmm0), %zmm3 {%k1}
; SKX-NEXT:    vpgatherqq (,%zmm1), %zmm4 {%k2}
; SKX-NEXT:    vmovaps %zmm3, %zmm0
; SKX-NEXT:    vmovaps %zmm4, %zmm1
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16i64:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:  .Ltmp1:
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:  .Ltmp2:
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:  .Ltmp3:
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
; SKX_32-NEXT:    vpgatherdq (,%ymm0), %zmm2 {%k1}
; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k2}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    retl
  %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
  ret <16 x i64> %res
}
declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)

; Gather 16 x float; same splitting strategy as test_gather_16i32, using the
; floating-point gather forms (vgatherqps / vgatherdps).
define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
; KNL_64-LABEL: test_gather_16f32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vgatherqps (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16f32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16f32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    vextractf32x8 $1, %zmm3, %ymm2
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vgatherqps (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vgatherqps (,%zmm0), %ymm3 {%k1}
; SKX-NEXT:    vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16f32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    retl
  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
  ret <16 x float> %res
}

; Gather 16 x double (two zmm results); on 32-bit targets half of the
; pass-through value is loaded from the realigned stack (vmovapd 8(%ebp)).
define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; KNL_64-LABEL: test_gather_16f64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vgatherqpd (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT:    vgatherqpd (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT:    vmovaps %zmm3, %zmm0
; KNL_64-NEXT:    vmovaps %zmm4, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16f64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp7:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp8:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp9:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vgatherdpd (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vgatherdpd (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16f64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vgatherqpd (,%zmm0), %zmm3 {%k1}
; SKX-NEXT:    vgatherqpd (,%zmm1), %zmm4 {%k2}
; SKX-NEXT:    vmovaps %zmm3, %zmm0
; SKX-NEXT:    vmovaps %zmm4, %zmm1
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16f64:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:  .Ltmp4:
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:  .Ltmp5:
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:  .Ltmp6:
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
; SKX_32-NEXT:    vgatherdpd (,%ymm0), %zmm2 {%k1}
; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vgatherdpd (,%ymm0), %zmm1 {%k2}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    retl
  %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
  ret <16 x double> %res
}
declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)

; Scatter 16 x i32: the 64-bit-pointer targets split into two vpscatterqd
; halves; 32-bit-pointer targets emit one vpscatterdd.
define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
; KNL_64-LABEL: test_scatter_16i32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16i32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16i32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
; SKX-NEXT:    vextracti32x8 $1, %zmm3, %ymm0
; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_16i32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT:    retl
  call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
; Scatter 16 x i64 (source value occupies two zmm registers).  On 32-bit
; targets half of %src0 is passed on the stack (vmovdqa64 8(%ebp)), hence
; the frame setup and 64-byte stack realignment.
define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; KNL_64-LABEL: test_scatter_16i64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16i64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp10:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp11:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp12:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vpscatterdq %zmm2, (,%ymm0) {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16i64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
; SKX-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_16i64:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:  .Ltmp7:
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:  .Ltmp8:
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:  .Ltmp9:
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
; SKX_32-NEXT:    vpscatterdq %zmm2, (,%ymm0) {%k1}
; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    retl
  call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)

; Scatter 16 x float: split into two vscatterqps halves with 64-bit
; pointers, a single vscatterdps with 32-bit pointers.
define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
; KNL_64-LABEL: test_scatter_16f32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16f32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16f32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
; SKX-NEXT:    vextractf32x8 $1, %zmm3, %ymm0
; SKX-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_16f32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT:    retl
  call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)

; Scatter 16 x double (two zmm source registers); 32-bit targets load the
; stack-passed half with vmovapd 8(%ebp) after realigning the stack.
define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; KNL_64-LABEL: test_scatter_16f64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16f64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp13:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp14:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp15:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vscatterdpd %zmm2, (,%ymm0) {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16f64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
; SKX-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_16f64:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:  .Ltmp10:
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:  .Ltmp11:
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:  .Ltmp12:
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
; SKX_32-NEXT:    vscatterdpd %zmm2, (,%ymm0) {%k1}
; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    retl
  call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)