; NOTE(review): the line below replaced residual code-browser navigation text
; ("Home | History | Annotate | Download | only in X86") that is not valid IR.
; LLVM codegen regression tests for masked gather/scatter on X86 AVX-512.
      1 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
      2 ; RUN: llc -mtriple=i386-unknown-linux-gnu  -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
      3 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
      4 ; RUN: llc -mtriple=i386-unknown-linux-gnu  -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32
      5 ; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
      6 ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
      7 
      8 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
      9 target triple = "x86_64-unknown-linux-gnu"
     10 
     11 
     12 ; SCALAR-LABEL: test1
     13 ; SCALAR:      extractelement <16 x float*>
     14 ; SCALAR-NEXT: load float
     15 ; SCALAR-NEXT: insertelement <16 x float>
     16 ; SCALAR-NEXT: extractelement <16 x float*>
     17 ; SCALAR-NEXT: load float
     18 
; test1: all-true mask gather. The splat of %base plus sign-extended 32-bit
; indices should fold into a single vgatherdps (kxnorw produces the all-ones
; mask register), with no extra index/mask manipulation.
     19 define <16 x float> @test1(float* %base, <16 x i32> %ind) {
     20 ; KNL_64-LABEL: test1:
     21 ; KNL_64:       # BB#0:
     22 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
     23 ; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
     24 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
     25 ; KNL_64-NEXT:    retq
     26 ;
     27 ; KNL_32-LABEL: test1:
     28 ; KNL_32:       # BB#0:
     29 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
     30 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
     31 ; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
     32 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
     33 ; KNL_32-NEXT:    retl
     34 ;
     35 ; SKX-LABEL: test1:
     36 ; SKX:       # BB#0:
     37 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
     38 ; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
     39 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
     40 ; SKX-NEXT:    retq
     41 
     42   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
     43   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
     44 
     45   %sext_ind = sext <16 x i32> %ind to <16 x i64>
     46   %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
     47 
     48   %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
     49   ret <16 x float>%res
     50 }
     51 
     52 declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
     53 declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
     54 declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
     55 
     56 
     57 ; SCALAR-LABEL: test2
     58 ; SCALAR:      extractelement <16 x float*>
     59 ; SCALAR-NEXT: load float
     60 ; SCALAR-NEXT: insertelement <16 x float>
     61 ; SCALAR-NEXT: br label %else
     62 ; SCALAR: else:
     63 ; SCALAR-NEXT:  %res.phi.else = phi
     64 ; SCALAR-NEXT:  %Mask1 = extractelement <16 x i1> %imask, i32 1
     65 ; SCALAR-NEXT:  %ToLoad1 = icmp eq i1 %Mask1, true
     66 ; SCALAR-NEXT:  br i1 %ToLoad1, label %cond.load1, label %else2
     67 
; test2: same addressing as test1 but the mask arrives as an i16 bitcast to
; <16 x i1>; it should be moved into a mask register (kmovw) and drive one
; vgatherdps directly.
     68 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
     69 ; KNL_64-LABEL: test2:
     70 ; KNL_64:       # BB#0:
     71 ; KNL_64-NEXT:    kmovw %esi, %k1
     72 ; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
     73 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
     74 ; KNL_64-NEXT:    retq
     75 ;
     76 ; KNL_32-LABEL: test2:
     77 ; KNL_32:       # BB#0:
     78 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
     79 ; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
     80 ; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
     81 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
     82 ; KNL_32-NEXT:    retl
     83 ;
     84 ; SKX-LABEL: test2:
     85 ; SKX:       # BB#0:
     86 ; SKX-NEXT:    kmovw %esi, %k1
     87 ; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
     88 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
     89 ; SKX-NEXT:    retq
     90 
     91   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
     92   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
     93 
     94   %sext_ind = sext <16 x i32> %ind to <16 x i64>
     95   %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
     96   %imask = bitcast i16 %mask to <16 x i1>
     97   %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
     98   ret <16 x float> %res
     99 }
    100 
; test3: integer variant of test2 — a v16i32 masked gather should select
; vpgatherdd (rather than the float form vgatherdps).
    101 define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
    102 ; KNL_64-LABEL: test3:
    103 ; KNL_64:       # BB#0:
    104 ; KNL_64-NEXT:    kmovw %esi, %k1
    105 ; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
    106 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
    107 ; KNL_64-NEXT:    retq
    108 ;
    109 ; KNL_32-LABEL: test3:
    110 ; KNL_32:       # BB#0:
    111 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    112 ; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    113 ; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
    114 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
    115 ; KNL_32-NEXT:    retl
    116 ;
    117 ; SKX-LABEL: test3:
    118 ; SKX:       # BB#0:
    119 ; SKX-NEXT:    kmovw %esi, %k1
    120 ; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
    121 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
    122 ; SKX-NEXT:    retq
    123 
    124   %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
    125   %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
    126 
    127   %sext_ind = sext <16 x i32> %ind to <16 x i64>
    128   %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
    129   %imask = bitcast i16 %mask to <16 x i1>
    130   %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
    131   ret <16 x i32> %res
    132 }
    133 
    134 
; test4: two gathers sharing one mask, where the second gather's pass-through
; is the first gather's result. Since gather instructions clobber their mask
; operand, the mask must be copied (kmovw %k1, %k2) so each gather gets its
; own live mask register; results are then summed.
    135 define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
    136 ; KNL_64-LABEL: test4:
    137 ; KNL_64:       # BB#0:
    138 ; KNL_64-NEXT:    kmovw %esi, %k1
    139 ; KNL_64-NEXT:    kmovw %k1, %k2
    140 ; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
    141 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm2
    142 ; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
    143 ; KNL_64-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
    144 ; KNL_64-NEXT:    retq
    145 ;
    146 ; KNL_32-LABEL: test4:
    147 ; KNL_32:       # BB#0:
    148 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    149 ; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    150 ; KNL_32-NEXT:    kmovw %k1, %k2
    151 ; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
    152 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm2
    153 ; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
    154 ; KNL_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
    155 ; KNL_32-NEXT:    retl
    156 ;
    157 ; SKX-LABEL: test4:
    158 ; SKX:       # BB#0:
    159 ; SKX-NEXT:    kmovw %esi, %k1
    160 ; SKX-NEXT:    kmovw %k1, %k2
    161 ; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
    162 ; SKX-NEXT:    vmovaps %zmm1, %zmm2
    163 ; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
    164 ; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
    165 ; SKX-NEXT:    retq
    166 
    167   %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
    168   %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
    169 
    170   %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
    171   %imask = bitcast i16 %mask to <16 x i1>
    172   %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
    173   %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
    174   %res = add <16 x i32> %gt1, %gt2
    175   ret <16 x i32> %res
    176 }
    177 
    178 
    179 ; SCALAR-LABEL: test5
    180 ; SCALAR:        %Mask0 = extractelement <16 x i1> %imask, i32 0
    181 ; SCALAR-NEXT:   %ToStore0 = icmp eq i1 %Mask0, true
    182 ; SCALAR-NEXT:   br i1 %ToStore0, label %cond.store, label %else
    183 ; SCALAR: cond.store:
    184 ; SCALAR-NEXT:  %Elt0 = extractelement <16 x i32> %val, i32 0
    185 ; SCALAR-NEXT:  %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
    186 ; SCALAR-NEXT:  store i32 %Elt0, i32* %Ptr0, align 4
    187 ; SCALAR-NEXT:  br label %else
    188 ; SCALAR: else:
    189 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
    190 ; SCALAR-NEXT:  %ToStore1 = icmp eq i1 %Mask1, true
    191 ; SCALAR-NEXT:  br i1 %ToStore1, label %cond.store1, label %else2
    192 
; test5: two identical masked scatters to the same addresses with the same
; mask and value. Both are emitted (vpscatterdd twice); note the duplicate
; store is NOT eliminated here, and the mask is duplicated into %k2 because
; scatter clobbers its mask operand.
    193 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
    194 ; KNL_64-LABEL: test5:
    195 ; KNL_64:       # BB#0:
    196 ; KNL_64-NEXT:    kmovw %esi, %k1
    197 ; KNL_64-NEXT:    kmovw %k1, %k2
    198 ; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
    199 ; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
    200 ; KNL_64-NEXT:    retq
    201 ;
    202 ; KNL_32-LABEL: test5:
    203 ; KNL_32:       # BB#0:
    204 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    205 ; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    206 ; KNL_32-NEXT:    kmovw %k1, %k2
    207 ; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
    208 ; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
    209 ; KNL_32-NEXT:    retl
    210 ;
    211 ; SKX-LABEL: test5:
    212 ; SKX:       # BB#0:
    213 ; SKX-NEXT:    kmovw %esi, %k1
    214 ; SKX-NEXT:    kmovw %k1, %k2
    215 ; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
    216 ; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
    217 ; SKX-NEXT:    retq
    218 
    219   %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
    220   %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
    221 
    222   %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
    223   %imask = bitcast i16 %mask to <16 x i1>
    224   call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
    225   call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
    226   ret void
    227 }
    228 
    229 declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
    230 declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
    231 
    232 
    233 ; SCALAR-LABEL: test6
    234 ; SCALAR:        store i32 %Elt0, i32* %Ptr01, align 4
    235 ; SCALAR-NEXT:   %Elt1 = extractelement <8 x i32> %a1, i32 1
    236 ; SCALAR-NEXT:   %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
    237 ; SCALAR-NEXT:   store i32 %Elt1, i32* %Ptr12, align 4
    238 ; SCALAR-NEXT:   %Elt2 = extractelement <8 x i32> %a1, i32 2
    239 ; SCALAR-NEXT:   %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
    240 ; SCALAR-NEXT:   store i32 %Elt2, i32* %Ptr23, align 4
    241 
; test6: all-true gather followed by all-true scatter through a raw vector of
; pointers (no uniform base), so the qword pointer vector is used directly as
; the index operand (vpgatherqd / vpscatterqd with no base register). On the
; 32-bit target the i32 pointers are first sign-extended to qwords.
    242 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
    243 ; KNL_64-LABEL: test6:
    244 ; KNL_64:       # BB#0:
    245 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
    246 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k2
    247 ; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
    248 ; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
    249 ; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
    250 ; KNL_64-NEXT:    retq
    251 ;
    252 ; KNL_32-LABEL: test6:
    253 ; KNL_32:       # BB#0:
    254 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
    255 ; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm2
    256 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k2
    257 ; KNL_32-NEXT:    vpgatherqd (,%zmm2), %ymm1 {%k2}
    258 ; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm2) {%k1}
    259 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
    260 ; KNL_32-NEXT:    retl
    261 ;
    262 ; SKX-LABEL: test6:
    263 ; SKX:       # BB#0:
    264 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
    265 ; SKX-NEXT:    kxnorw %k0, %k0, %k2
    266 ; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
    267 ; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
    268 ; SKX-NEXT:    vmovaps %zmm2, %zmm0
    269 ; SKX-NEXT:    retq
    270 
    271   %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
    272 
    273   call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
    274   ret <8 x i32>%a
    275 }
    276 
; test7: v8i32 gather with an i8 mask. KNL has no 256-bit gather, so the
; indices are widened (vpmovsxdq) and vpgatherqd on zmm/ymm is used; SKX
; (AVX512VL) can use vpgatherdd on ymm directly with a kmovb mask. As in
; test4, two chained gathers force a mask copy.
    277 define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
    278 ;
    279 ; KNL_64-LABEL: test7:
    280 ; KNL_64:       # BB#0:
    281 ; KNL_64-NEXT:    kmovw %esi, %k1
    282 ; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
    283 ; KNL_64-NEXT:    kmovw %k1, %k2
    284 ; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
    285 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm2
    286 ; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
    287 ; KNL_64-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
    288 ; KNL_64-NEXT:    retq
    289 ;
    290 ; KNL_32-LABEL: test7:
    291 ; KNL_32:       # BB#0:
    292 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    293 ; KNL_32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
    294 ; KNL_32-NEXT:    kmovw %ecx, %k1
    295 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
    296 ; KNL_32-NEXT:    kmovw %k1, %k2
    297 ; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
    298 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm2
    299 ; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
    300 ; KNL_32-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
    301 ; KNL_32-NEXT:    retl
    302 ;
    303 ; SKX-LABEL: test7:
    304 ; SKX:       # BB#0:
    305 ; SKX-NEXT:    kmovb %esi, %k1
    306 ; SKX-NEXT:    kmovw %k1, %k2
    307 ; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
    308 ; SKX-NEXT:    vmovaps %zmm1, %zmm2
    309 ; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
    310 ; SKX-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
    311 ; SKX-NEXT:    retq
    312 
    313   %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
    314   %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
    315 
    316   %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
    317   %imask = bitcast i8 %mask to <8 x i1>
    318   %gt1 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
    319   %gt2 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
    320   %res = add <8 x i32> %gt1, %gt2
    321   ret <8 x i32> %res
    322 }
    323 
    324 ; No uniform base in this case, index <8 x i64> contains addresses,
    325 ; each gather call will be split into two
; test8: no uniform base — the pointer operand is a full <16 x i32*> vector.
; On 64-bit targets the 16 qword addresses do not fit one zmm, so each gather
; is split into two vpgatherqd halves with the mask split via kshiftrw and the
; halves rejoined with vinserti64x4/vinserti32x8. On 32-bit targets the i32
; pointers fit one zmm and a single vpgatherdd per gather suffices.
    326 define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
    327 ; KNL_64-LABEL: test8:
    328 ; KNL_64:       # BB#0:
    329 ; KNL_64-NEXT:    kmovw %edi, %k1
    330 ; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
    331 ; KNL_64-NEXT:    kmovw %k2, %k3
    332 ; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k3}
    333 ; KNL_64-NEXT:    kmovw %k1, %k3
    334 ; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k3}
    335 ; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm4
    336 ; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
    337 ; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
    338 ; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
    339 ; KNL_64-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
    340 ; KNL_64-NEXT:    retq
    341 ;
    342 ; KNL_32-LABEL: test8:
    343 ; KNL_32:       # BB#0:
    344 ; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    345 ; KNL_32-NEXT:    kmovw %k1, %k2
    346 ; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k2}
    347 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm2
    348 ; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
    349 ; KNL_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
    350 ; KNL_32-NEXT:    retl
    351 ;
    352 ; SKX-LABEL: test8:
    353 ; SKX:       # BB#0:
    354 ; SKX-NEXT:    kmovw %edi, %k1
    355 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
    356 ; SKX-NEXT:    kmovw %k2, %k3
    357 ; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k3}
    358 ; SKX-NEXT:    kmovw %k1, %k3
    359 ; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k3}
    360 ; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm4
    361 ; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
    362 ; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
    363 ; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0
    364 ; SKX-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
    365 ; SKX-NEXT:    retq
    366 ;
    367 ; SKX_32-LABEL: test8:
    368 ; SKX_32:       # BB#0:
    369 ; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    370 ; SKX_32-NEXT:    kmovw %k1, %k2
    371 ; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k2}
    372 ; SKX_32-NEXT:    vmovaps %zmm1, %zmm2
    373 ; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
    374 ; SKX_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
    375 ; SKX_32-NEXT:    retl
    376 
    377   %imask = bitcast i16 %mask to <16 x i1>
    378   %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
    379   %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
    380   %res = add <16 x i32> %gt1, %gt2
    381   ret <16 x i32> %res
    382 }
    383 
    384 %struct.RT = type { i8, [10 x [20 x i32]], i8 }
    385 %struct.ST = type { i32, double, %struct.RT }
    386 
    387 ; Masked gather for agregate types
    388 ; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
    389 
    390 
; test9: gather through a multi-index GEP into %struct.ST where EVERY index is
; a vector (splat constants for the scalar struct fields). The address math is
; scalarized into explicit vector multiplies/adds (KNL lacks vpmullq, hence the
; vpmuludq/vpsrlq/vpsllq sequence) and ends in a single all-true vpgatherqd.
; Expected to produce the same addresses as test10.
    391 define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
    392 ; KNL_64-LABEL: test9:
    393 ; KNL_64:       # BB#0: # %entry
    394 ; KNL_64-NEXT:    vpbroadcastq %rdi, %zmm2
    395 ; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
    396 ; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
    397 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm4
    398 ; KNL_64-NEXT:    vpsrlq $32, %zmm1, %zmm1
    399 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
    400 ; KNL_64-NEXT:    vpsllq $32, %zmm1, %zmm1
    401 ; KNL_64-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
    402 ; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
    403 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
    404 ; KNL_64-NEXT:    vpsrlq $32, %zmm0, %zmm0
    405 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
    406 ; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
    407 ; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
    408 ; KNL_64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
    409 ; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
    410 ; KNL_64-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
    411 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
    412 ; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
    413 ; KNL_64-NEXT:    retq
    414 ;
    415 ; KNL_32-LABEL: test9:
    416 ; KNL_32:       # BB#0: # %entry
    417 ; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
    418 ; KNL_32-NEXT:    vpbroadcastd .LCPI8_0, %ymm3
    419 ; KNL_32-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
    420 ; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
    421 ; KNL_32-NEXT:    vpbroadcastd .LCPI8_1, %ymm3
    422 ; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
    423 ; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
    424 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    425 ; KNL_32-NEXT:    vpbroadcastd .LCPI8_2, %ymm1
    426 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    427 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
    428 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
    429 ; KNL_32-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
    430 ; KNL_32-NEXT:    retl
    431 ;
    432 ; SKX-LABEL: test9:
    433 ; SKX:       # BB#0: # %entry
    434 ; SKX-NEXT:    vpbroadcastq %rdi, %zmm2
    435 ; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
    436 ; SKX-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
    437 ; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
    438 ; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
    439 ; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
    440 ; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
    441 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
    442 ; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
    443 ; SKX-NEXT:    retq
    444 entry:
    445   %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
    446   %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
    447 
    448   %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
    449   %res = call <8 x i32 >  @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
    450   ret <8 x i32> %res
    451 }
    452 
; test10: identical to test9 except the constant GEP indices are written as
; SCALARS (i32 2, i32 1, i64 13) instead of splat vectors. Codegen must
; canonicalize both forms the same way, so the expected output matches test9
; instruction-for-instruction (only the .LCPI constant-pool labels differ).
    453 define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
    454 ; KNL_64-LABEL: test10:
    455 ; KNL_64:       # BB#0: # %entry
    456 ; KNL_64-NEXT:    vpbroadcastq %rdi, %zmm2
    457 ; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
    458 ; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
    459 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm4
    460 ; KNL_64-NEXT:    vpsrlq $32, %zmm1, %zmm1
    461 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
    462 ; KNL_64-NEXT:    vpsllq $32, %zmm1, %zmm1
    463 ; KNL_64-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
    464 ; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
    465 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
    466 ; KNL_64-NEXT:    vpsrlq $32, %zmm0, %zmm0
    467 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
    468 ; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
    469 ; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
    470 ; KNL_64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
    471 ; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
    472 ; KNL_64-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
    473 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
    474 ; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
    475 ; KNL_64-NEXT:    retq
    476 ;
    477 ; KNL_32-LABEL: test10:
    478 ; KNL_32:       # BB#0: # %entry
    479 ; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
    480 ; KNL_32-NEXT:    vpbroadcastd .LCPI9_0, %ymm3
    481 ; KNL_32-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
    482 ; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
    483 ; KNL_32-NEXT:    vpbroadcastd .LCPI9_1, %ymm3
    484 ; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
    485 ; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
    486 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    487 ; KNL_32-NEXT:    vpbroadcastd .LCPI9_2, %ymm1
    488 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    489 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
    490 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
    491 ; KNL_32-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
    492 ; KNL_32-NEXT:    retl
    493 ;
    494 ; SKX-LABEL: test10:
    495 ; SKX:       # BB#0: # %entry
    496 ; SKX-NEXT:    vpbroadcastq %rdi, %zmm2
    497 ; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
    498 ; SKX-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
    499 ; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
    500 ; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
    501 ; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
    502 ; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
    503 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
    504 ; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
    505 ; SKX-NEXT:    retq
    506 entry:
    507   %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
    508   %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
    509 
    510   %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
    511   %res = call <8 x i32 >  @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
    512   ret <8 x i32> %res
    513 }
    514 
    515 ; Splat index in GEP, requires broadcast
; test11: the GEP index is a single scalar i32 applied to a splat base, so the
; index must be broadcast into a vector register (vpbroadcastd) before the
; gather; all lanes then load the same element.
    516 define <16 x float> @test11(float* %base, i32 %ind) {
    517 ; KNL_64-LABEL: test11:
    518 ; KNL_64:       # BB#0:
    519 ; KNL_64-NEXT:    vpbroadcastd %esi, %zmm1
    520 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
    521 ; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
    522 ; KNL_64-NEXT:    retq
    523 ;
    524 ; KNL_32-LABEL: test11:
    525 ; KNL_32:       # BB#0:
    526 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    527 ; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %zmm1
    528 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
    529 ; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
    530 ; KNL_32-NEXT:    retl
    531 ;
    532 ; SKX-LABEL: test11:
    533 ; SKX:       # BB#0:
    534 ; SKX-NEXT:    vpbroadcastd %esi, %zmm1
    535 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
    536 ; SKX-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
    537 ; SKX-NEXT:    retq
    538 
    539   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
    540   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
    541 
    542   %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
    543 
    544   %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
    545   ret <16 x float>%res
    546 }
    547 
    548 ; We are checking the uniform base here. It is taken directly from input to vgatherdps
; test12: the GEP has a SCALAR base pointer and a vector of indices; the base
; should be recognized as uniform and fed directly into vgatherdps as the
; base register (no broadcast of the pointer needed).
    549 define <16 x float> @test12(float* %base, <16 x i32> %ind) {
    550 ; KNL_64-LABEL: test12:
    551 ; KNL_64:       # BB#0:
    552 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
    553 ; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
    554 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
    555 ; KNL_64-NEXT:    retq
    556 ;
    557 ; KNL_32-LABEL: test12:
    558 ; KNL_32:       # BB#0:
    559 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    560 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
    561 ; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
    562 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
    563 ; KNL_32-NEXT:    retl
    564 ;
    565 ; SKX-LABEL: test12:
    566 ; SKX:       # BB#0:
    567 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
    568 ; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
    569 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
    570 ; SKX-NEXT:    retq
    571 
    572   %sext_ind = sext <16 x i32> %ind to <16 x i64>
    573   %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
    574 
    575   %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
    576   ret <16 x float>%res
    577 }
    578 
    579 ; The same as the previous, but the mask is undefined
; test13: same as test12 but with an UNDEF mask — the gather is still emitted
; using whatever happens to be in %k1 (no kxnorw/kmovw appears before it),
; which is why the CHECK lines have no mask-setup instruction.
    580 define <16 x float> @test13(float* %base, <16 x i32> %ind) {
    581 ; KNL_64-LABEL: test13:
    582 ; KNL_64:       # BB#0:
    583 ; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
    584 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
    585 ; KNL_64-NEXT:    retq
    586 ;
    587 ; KNL_32-LABEL: test13:
    588 ; KNL_32:       # BB#0:
    589 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    590 ; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
    591 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
    592 ; KNL_32-NEXT:    retl
    593 ;
    594 ; SKX-LABEL: test13:
    595 ; SKX:       # BB#0:
    596 ; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
    597 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
    598 ; SKX-NEXT:    retq
    599 
    600   %sext_ind = sext <16 x i32> %ind to <16 x i64>
    601   %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
    602 
    603   %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
    604   ret <16 x float>%res
    605 }
    606 
    607 ; The base pointer is not splat, can't find unform base
; test14: %base is inserted at lane 1 (not lane 0) of an arbitrary pointer
; vector, then lane 0 is splatted — so the splatted value comes from %vec,
; not %base, and the backend cannot treat %base as a uniform base. The full
; vector address computation is materialized and the gather uses a zero base.
; (Mask is undef here too, hence no mask setup before the gathers.)
    608 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
    609 ; KNL_64-LABEL: test14:
    610 ; KNL_64:       # BB#0:
    611 ; KNL_64-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
    612 ; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
    613 ; KNL_64-NEXT:    vpbroadcastq %xmm0, %zmm0
    614 ; KNL_64-NEXT:    vmovd %esi, %xmm1
    615 ; KNL_64-NEXT:    vpbroadcastd %xmm1, %ymm1
    616 ; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
    617 ; KNL_64-NEXT:    vpsllq $2, %zmm1, %zmm1
    618 ; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
    619 ; KNL_64-NEXT:    kshiftrw $8, %k0, %k1
    620 ; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm1 {%k1}
    621 ; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm2 {%k1}
    622 ; KNL_64-NEXT:    vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
    623 ; KNL_64-NEXT:    retq
    624 ;
    625 ; KNL_32-LABEL: test14:
    626 ; KNL_32:       # BB#0:
    627 ; KNL_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
    628 ; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
    629 ; KNL_32-NEXT:    vpbroadcastd %xmm0, %zmm0
    630 ; KNL_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
    631 ; KNL_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
    632 ; KNL_32-NEXT:    vgatherdps (,%zmm1), %zmm0 {%k1}
    633 ; KNL_32-NEXT:    retl
    634 ;
    635 ; SKX-LABEL: test14:
    636 ; SKX:       # BB#0:
    637 ; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
    638 ; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm0
    639 ; SKX-NEXT:    vpbroadcastq %xmm0, %zmm0
    640 ; SKX-NEXT:    vpbroadcastd %esi, %ymm1
    641 ; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
    642 ; SKX-NEXT:    vpsllq $2, %zmm1, %zmm1
    643 ; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
    644 ; SKX-NEXT:    kshiftrw $8, %k0, %k1
    645 ; SKX-NEXT:    vgatherqps (,%zmm0), %ymm1 {%k1}
    646 ; SKX-NEXT:    vgatherqps (,%zmm0), %ymm2 {%k1}
    647 ; SKX-NEXT:    vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
    648 ; SKX-NEXT:    retq
    649 ;
    650 ; SKX_32-LABEL: test14:
    651 ; SKX_32:       # BB#0:
    652 ; SKX_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
    653 ; SKX_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
    654 ; SKX_32-NEXT:    vpbroadcastd %xmm0, %zmm0
    655 ; SKX_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
    656 ; SKX_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
    657 ; SKX_32-NEXT:    vgatherdps (,%zmm1), %zmm0 {%k1}
    658 ; SKX_32-NEXT:    retl
    659 
    660   %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
    661   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
    662 
    663   %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
    664 
    665   %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
    666   ret <16 x float>%res
    667 }
    668 
    669 declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
    670 declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
    671 declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
    672 
    673 ; Gather smaller than existing instruction
; NOTE(review): v4f32 masked gather smaller than the native 512-bit width:
; indices are sign-extended to i64 and gep'd off %base. The CHECK lines are
; golden codegen: KNL widens to a 512-bit vgatherqps, SKX/VL uses a native
; 128-bit vgatherdps. Do not edit CHECK lines by hand.
    674 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
    675 ;
    676 ; KNL_64-LABEL: test15:
    677 ; KNL_64:       # BB#0:
    678 ; KNL_64:         vpxor %ymm2, %ymm2, %ymm2
    679 ; KNL_64-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
    680 ; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm2
    681 ; KNL_64-NEXT:    vpslld $31, %ymm1, %ymm0
    682 ; KNL_64-NEXT:    vptestmd %zmm0, %zmm0, %k1
    683 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
    684 ; KNL_64-NEXT:    # kill
    685 ; KNL_64-NEXT:    retq
    686 ;
    687 ; KNL_32-LABEL: test15:
    688 ; KNL_32:       # BB#0:
    689 ; KNL_32:         vpxor %ymm2, %ymm2, %ymm2
    690 ; KNL_32-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
    691 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    692 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm2
    693 ; KNL_32-NEXT:    vpslld $31, %ymm1, %ymm0
    694 ; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k1
    695 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
    696 ; KNL_32-NEXT:    # kill
    697 ; KNL_32-NEXT:    retl
    698 ;
    699 ; SKX-LABEL: test15:
    700 ; SKX:       # BB#0:
    701 ; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
    702 ; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
    703 ; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
    704 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
    705 ; SKX-NEXT:    retq
    706 ;
    707 ; SKX_32-LABEL: test15:
    708 ; SKX_32:       # BB#0:
    709 ; SKX_32-NEXT:    vpslld $31, %xmm1, %xmm1
    710 ; SKX_32-NEXT:    vptestmd %xmm1, %xmm1, %k1
    711 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    712 ; SKX_32-NEXT:    vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
    713 ; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
    714 ; SKX_32-NEXT:    retl
    715 
    716   %sext_ind = sext <4 x i32> %ind to <4 x i64>
    717   %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
    718   %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
    719   ret <4 x float>%res
    720 }
    721 
    722 ; Gather smaller than existing instruction
; NOTE(review): v4f64 masked gather with a pass-through value (%src0) that must
; survive in the masked-off lanes. KNL promotes the v4i1 mask through a zmm
; register (slld/srad/pmovsxdq + vptestmq); SKX tests the mask directly and
; emits a 256-bit vgatherdpd. CHECK lines are golden output — regenerate, don't hand-edit.
    723 define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
    724 ;
    725 ; KNL_64-LABEL: test16:
    726 ; KNL_64:       # BB#0:
    727 ; KNL_64:         vpslld $31, %xmm1, %xmm1
    728 ; KNL_64-NEXT:    vpsrad $31, %xmm1, %xmm1
    729 ; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
    730 ; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
    731 ; KNL_64-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
    732 ; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
    733 ; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
    734 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
    735 ; KNL_64-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
    736 ; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
    737 ; KNL_64-NEXT:    retq
    738 ;
    739 ; KNL_32-LABEL: test16:
    740 ; KNL_32:       # BB#0:
    741 ; KNL_32:         vpslld $31, %xmm1, %xmm1
    742 ; KNL_32-NEXT:    vpsrad $31, %xmm1, %xmm1
    743 ; KNL_32-NEXT:    vpmovsxdq %xmm1, %ymm1
    744 ; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
    745 ; KNL_32-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
    746 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    747 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
    748 ; KNL_32-NEXT:    vpsllvq .LCPI15_0, %zmm1, %zmm1
    749 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
    750 ; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
    751 ; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
    752 ; KNL_32-NEXT:    retl
    753 ;
    754 ; SKX-LABEL: test16:
    755 ; SKX:       # BB#0:
    756 ; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
    757 ; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
    758 ; SKX-NEXT:    vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
    759 ; SKX-NEXT:    vmovaps %zmm2, %zmm0
    760 ; SKX-NEXT:    retq
    761 ;
    762 ; SKX_32-LABEL: test16:
    763 ; SKX_32:       # BB#0:
    764 ; SKX_32-NEXT:    vpslld $31, %xmm1, %xmm1
    765 ; SKX_32-NEXT:    vptestmd %xmm1, %xmm1, %k1
    766 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    767 ; SKX_32-NEXT:    vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
    768 ; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
    769 ; SKX_32-NEXT:    retl
    770 
    771   %sext_ind = sext <4 x i32> %ind to <4 x i64>
    772   %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
    773   %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
    774   ret <4 x double>%res
    775 }
    776 
; NOTE(review): v2f64 masked gather with pass-through %src0. KNL has no 128-bit
; masked ops, so the v2i1 mask is inserted into a zeroed zmm and tested with
; vptestmq; SKX/VL gathers directly at xmm width. CHECK lines are golden output.
    777 define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
    778 ;
    779 ; KNL_64-LABEL: test17:
    780 ; KNL_64:       # BB#0:
    781 ; KNL_64:         vpxord %zmm3, %zmm3, %zmm3
    782 ; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
    783 ; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
    784 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
    785 ; KNL_64-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
    786 ; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
    787 ; KNL_64-NEXT:    retq
    788 ;
    789 ; KNL_32-LABEL: test17:
    790 ; KNL_32:       # BB#0:
    791 ; KNL_32:         vpxord %zmm3, %zmm3, %zmm3
    792 ; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
    793 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    794 ; KNL_32-NEXT:    vpsllvq .LCPI16_0, %zmm1, %zmm1
    795 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
    796 ; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
    797 ; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
    798 ; KNL_32-NEXT:    retl
    799 ;
    800 ; SKX-LABEL: test17:
    801 ; SKX:       # BB#0:
    802 ; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
    803 ; SKX-NEXT:    vptestmq %xmm1, %xmm1, %k1
    804 ; SKX-NEXT:    vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
    805 ; SKX-NEXT:    vmovaps %zmm2, %zmm0
    806 ; SKX-NEXT:    retq
    807 ;
    808 ; SKX_32-LABEL: test17:
    809 ; SKX_32:       # BB#0:
    810 ; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
    811 ; SKX_32-NEXT:    vptestmq %xmm1, %xmm1, %k1
    812 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    813 ; SKX_32-NEXT:    vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
    814 ; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
    815 ; SKX_32-NEXT:    retl
    816 
    817   %sext_ind = sext <2 x i32> %ind to <2 x i64>
    818   %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
    819   %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
    820   ret <2 x double>%res
    821 }
    822 
    823 declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
    824 declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
    825 declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
    826 declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
    827 declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
    828 
; NOTE(review): v4i32 masked scatter through an explicit pointer vector %ptr
; (no base register: note the (,%zmm1)/(,%ymm1) addressing in the CHECKs).
; SKX_32 keeps everything at xmm width since i386 pointers are 32-bit.
    829 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
    830 ;
    831 ; KNL_64-LABEL: test18:
    832 ; KNL_64:       # BB#0:
    833 ; KNL_64:         vpxor %ymm3, %ymm3, %ymm3
    834 ; KNL_64-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
    835 ; KNL_64-NEXT:    vpslld $31, %ymm2, %ymm2
    836 ; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
    837 ; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
    838 ; KNL_64-NEXT:    retq
    839 ;
    840 ; KNL_32-LABEL: test18:
    841 ; KNL_32:       # BB#0:
    842 ; KNL_32:         vpxor %ymm3, %ymm3, %ymm3
    843 ; KNL_32-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
    844 ; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
    845 ; KNL_32-NEXT:    vpslld $31, %ymm2, %ymm2
    846 ; KNL_32-NEXT:    vptestmd %zmm2, %zmm2, %k1
    847 ; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
    848 ; KNL_32-NEXT:    retl
    849 ;
    850 ; SKX-LABEL: test18:
    851 ; SKX:       # BB#0:
    852 ; SKX-NEXT:    vpslld $31, %xmm2, %xmm2
    853 ; SKX-NEXT:    vptestmd %xmm2, %xmm2, %k1
    854 ; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
    855 ; SKX-NEXT:    retq
    856 ;
    857 ; SKX_32-LABEL: test18:
    858 ; SKX_32:       # BB#0:
    859 ; SKX_32-NEXT:    vpslld $31, %xmm2, %xmm2
    860 ; SKX_32-NEXT:    vptestmd %xmm2, %xmm2, %k1
    861 ; SKX_32-NEXT:    vpscatterdd %xmm0, (,%xmm1) {%k1}
    862 ; SKX_32-NEXT:    retl
    863   call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
    864   ret void
    865 }
    866 
; NOTE(review): v4f64 masked scatter of %a1 to %ptr indexed by 64-bit %ind
; (alignment operand 8). KNL widens the mask through zmm; SKX scatters at
; 256-bit width with vscatterqpd. CHECK lines are golden output.
    867 define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
    868 ;
    869 ; KNL_64-LABEL: test19:
    870 ; KNL_64:       # BB#0:
    871 ; KNL_64:         vpslld $31, %xmm1, %xmm1
    872 ; KNL_64-NEXT:    vpsrad $31, %xmm1, %xmm1
    873 ; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
    874 ; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
    875 ; KNL_64-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
    876 ; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
    877 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
    878 ; KNL_64-NEXT:    vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
    879 ; KNL_64-NEXT:    retq
    880 ;
    881 ; KNL_32-LABEL: test19:
    882 ; KNL_32:       # BB#0:
    883 ; KNL_32:         vpslld $31, %xmm1, %xmm1
    884 ; KNL_32-NEXT:    vpsrad $31, %xmm1, %xmm1
    885 ; KNL_32-NEXT:    vpmovsxdq %xmm1, %ymm1
    886 ; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
    887 ; KNL_32-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
    888 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    889 ; KNL_32-NEXT:    vpsllvq .LCPI18_0, %zmm1, %zmm1
    890 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
    891 ; KNL_32-NEXT:    vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
    892 ; KNL_32-NEXT:    retl
    893 ;
    894 ; SKX-LABEL: test19:
    895 ; SKX:       # BB#0:
    896 ; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
    897 ; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
    898 ; SKX-NEXT:    vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
    899 ; SKX-NEXT:    retq
    900 ;
    901 ; SKX_32-LABEL: test19:
    902 ; SKX_32:       # BB#0:
    903 ; SKX_32-NEXT:    vpslld $31, %xmm1, %xmm1
    904 ; SKX_32-NEXT:    vptestmd %xmm1, %xmm1, %k1
    905 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    906 ; SKX_32-NEXT:    vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
    907 ; SKX_32-NEXT:    retl
    908   %gep = getelementptr double, double* %ptr, <4 x i64> %ind
    909   call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
    910   ret void
    911 }
    912 
    913 ; Data type requires widening
; NOTE(review): v2f32 masked scatter — the element count is below any native
; vector width, so the type is widened (see preceding file comment). SKX trims
; the widened mask with kshiftlb/kshiftrb $6 to keep only the 2 live lanes.
    914 define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
    915 ;
    916 ; KNL_64-LABEL: test20:
    917 ; KNL_64:       # BB#0:
    918 ; KNL_64:         vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
    919 ; KNL_64-NEXT:    vmovq {{.*#+}} xmm2 = xmm2[0],zero
    920 ; KNL_64-NEXT:    vpxor %ymm3, %ymm3, %ymm3
    921 ; KNL_64-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
    922 ; KNL_64-NEXT:    vpslld $31, %ymm2, %ymm2
    923 ; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
    924 ; KNL_64-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k1}
    925 ; KNL_64-NEXT:    retq
    926 ;
    927 ; KNL_32-LABEL: test20:
    928 ; KNL_32:       # BB#0:
    929 ; KNL_32:         vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
    930 ; KNL_32-NEXT:    vmovq {{.*#+}} xmm2 = xmm2[0],zero
    931 ; KNL_32-NEXT:    vpxor %ymm3, %ymm3, %ymm3
    932 ; KNL_32-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
    933 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    934 ; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
    935 ; KNL_32-NEXT:    vpslld $31, %ymm2, %ymm2
    936 ; KNL_32-NEXT:    vptestmd %zmm2, %zmm2, %k1
    937 ; KNL_32-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k1}
    938 ; KNL_32-NEXT:    retl
    939 ;
    940 ; SKX-LABEL: test20:
    941 ; SKX:       # BB#0:
    942 ; SKX:         vpsllq $63, %xmm2, %xmm2
    943 ; SKX-NEXT:    vptestmq %xmm2, %xmm2, %k0
    944 ; SKX-NEXT:    kshiftlb $6, %k0, %k0
    945 ; SKX-NEXT:    kshiftrb $6, %k0, %k1
    946 ; SKX-NEXT:    vscatterqps %xmm0, (,%ymm1) {%k1}
    947 ; SKX-NEXT:    retq
    948 ;
    949 ; SKX_32-LABEL: test20:
    950 ; SKX_32:       # BB#0:
    951 ; SKX_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    952 ; SKX_32-NEXT:    vpsllq $63, %xmm2, %xmm2
    953 ; SKX_32-NEXT:    vptestmq %xmm2, %xmm2, %k0
    954 ; SKX_32-NEXT:    kshiftlb $6, %k0, %k0
    955 ; SKX_32-NEXT:    kshiftrb $6, %k0, %k1
    956 ; SKX_32-NEXT:    vscatterdps %xmm0, (,%xmm1) {%k1}
    957 ; SKX_32-NEXT:    retl
    958   call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
    959   ret void
    960 }
    961 
    962 ; Data type requires promotion
; NOTE(review): v2i32 masked scatter — the i32 elements require promotion
; (see preceding file comment); vpshufd [0,2,2,3] compresses the promoted
; 64-bit lanes back to the two live i32s before vpscatterqd.
    963 define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
    964 ;
    965 ; KNL_64-LABEL: test21:
    966 ; KNL_64:       # BB#0:
    967 ; KNL_64:         vpxord %zmm3, %zmm3, %zmm3
    968 ; KNL_64-NEXT:    vinserti32x4 $0, %xmm2, %zmm3, %zmm2
    969 ; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    970 ; KNL_64-NEXT:    vpsllq $63, %zmm2, %zmm2
    971 ; KNL_64-NEXT:    vptestmq %zmm2, %zmm2, %k1
    972 ; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
    973 ; KNL_64-NEXT:    retq
    974 ;
    975 ; KNL_32-LABEL: test21:
    976 ; KNL_32:       # BB#0:
    977 ; KNL_32:         vpxord %zmm3, %zmm3, %zmm3
    978 ; KNL_32-NEXT:    vinserti32x4 $0, %xmm2, %zmm3, %zmm2
    979 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    980 ; KNL_32-NEXT:    vpsllvq .LCPI20_0, %zmm2, %zmm2
    981 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
    982 ; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
    983 ; KNL_32-NEXT:    retl
    984 ;
    985 ; SKX-LABEL: test21:
    986 ; SKX:       # BB#0:
    987 ; SKX:         vpsllq $63, %xmm2, %xmm2
    988 ; SKX-NEXT:    vptestmq %xmm2, %xmm2, %k0
    989 ; SKX-NEXT:    kshiftlb $6, %k0, %k0
    990 ; SKX-NEXT:    kshiftrb $6, %k0, %k1
    991 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    992 ; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
    993 ; SKX-NEXT:    retq
    994 ;
    995 ; SKX_32-LABEL: test21:
    996 ; SKX_32:       # BB#0:
    997 ; SKX_32:         vpsllq $63, %xmm2, %xmm2
    998 ; SKX_32-NEXT:    vptestmq %xmm2, %xmm2, %k0
    999 ; SKX_32-NEXT:    kshiftlb $6, %k0, %k0
   1000 ; SKX_32-NEXT:    kshiftrb $6, %k0, %k1
   1001 ; SKX_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1002 ; SKX_32-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
   1003 ; SKX_32-NEXT:    retl
   1004   call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
   1005   ret void
   1006 }
   1007 
   1008 ; The result type requires widening
   1009 declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
   1010 
; NOTE(review): v2f32 masked gather with pass-through %src0 — result type
; requires widening (see preceding declare comment). KNL widens both data and
; mask to 512 bits; SKX narrows the widened k-mask with kshiftlb/kshiftrb $6
; and gathers at xmm width.
   1011 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
   1012 ;
   1013 ;
   1014 ; KNL_64-LABEL: test22:
   1015 ; KNL_64:       # BB#0:
   1016 ; KNL_64:         vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   1017 ; KNL_64-NEXT:    vmovq {{.*#+}} xmm1 = xmm1[0],zero
   1018 ; KNL_64-NEXT:    vpxor %ymm3, %ymm3, %ymm3
   1019 ; KNL_64-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
   1020 ; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1021 ; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
   1022 ; KNL_64-NEXT:    vpslld $31, %ymm1, %ymm1
   1023 ; KNL_64-NEXT:    vptestmd %zmm1, %zmm1, %k1
   1024 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
   1025 ; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
   1026 ; KNL_64-NEXT:    retq
   1027 ;
   1028 ; KNL_32-LABEL: test22:
   1029 ; KNL_32:       # BB#0:
   1030 ; KNL_32:         vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   1031 ; KNL_32-NEXT:    vmovq {{.*#+}} xmm1 = xmm1[0],zero
   1032 ; KNL_32-NEXT:    vpxor %ymm3, %ymm3, %ymm3
   1033 ; KNL_32-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
   1034 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1035 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1036 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
   1037 ; KNL_32-NEXT:    vpslld $31, %ymm1, %ymm1
   1038 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
   1039 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
   1040 ; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
   1041 ; KNL_32-NEXT:    retl
   1042 ;
   1043 ; SKX-LABEL: test22:
   1044 ; SKX:       # BB#0:
   1045 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1046 ; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
   1047 ; SKX-NEXT:    vptestmq %xmm1, %xmm1, %k0
   1048 ; SKX-NEXT:    kshiftlb $6, %k0, %k0
   1049 ; SKX-NEXT:    kshiftrb $6, %k0, %k1
   1050 ; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
   1051 ; SKX-NEXT:    vmovaps %zmm2, %zmm0
   1052 ; SKX-NEXT:    retq
   1053 ;
   1054 ; SKX_32-LABEL: test22:
   1055 ; SKX_32:       # BB#0:
   1056 ; SKX_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1057 ; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
   1058 ; SKX_32-NEXT:    vptestmq %xmm1, %xmm1, %k0
   1059 ; SKX_32-NEXT:    kshiftlb $6, %k0, %k0
   1060 ; SKX_32-NEXT:    kshiftrb $6, %k0, %k1
   1061 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1062 ; SKX_32-NEXT:    vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
   1063 ; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
   1064 ; SKX_32-NEXT:    retl
   1065   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   1066   %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
   1067   %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
   1068   ret <2 x float>%res
   1069 }
   1070 
   1071 declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
   1072 declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
   1073 
; NOTE(review): v2i32 masked gather with pass-through — i32 elements are
; promoted to i64 lanes, hence the 64-bit vpgatherqq in all four configs.
; CHECK lines are golden output; regenerate rather than hand-edit.
   1074 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
   1075 ;
   1076 ; KNL_64-LABEL: test23:
   1077 ; KNL_64:       # BB#0:
   1078 ; KNL_64:         vpxord %zmm3, %zmm3, %zmm3
   1079 ; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
   1080 ; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
   1081 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
   1082 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
   1083 ; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
   1084 ; KNL_64-NEXT:    retq
   1085 ;
   1086 ; KNL_32-LABEL: test23:
   1087 ; KNL_32:       # BB#0:
   1088 ; KNL_32:         vpxord %zmm3, %zmm3, %zmm3
   1089 ; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
   1090 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1091 ; KNL_32-NEXT:    vpsllvq .LCPI22_0, %zmm1, %zmm1
   1092 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
   1093 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
   1094 ; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
   1095 ; KNL_32-NEXT:    retl
   1096 ;
   1097 ; SKX-LABEL: test23:
   1098 ; SKX:       # BB#0:
   1099 ; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
   1100 ; SKX-NEXT:    vptestmq %xmm1, %xmm1, %k1
   1101 ; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
   1102 ; SKX-NEXT:    vmovaps %zmm2, %zmm0
   1103 ; SKX-NEXT:    retq
   1104 ;
   1105 ; SKX_32-LABEL: test23:
   1106 ; SKX_32:       # BB#0:
   1107 ; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
   1108 ; SKX_32-NEXT:    vptestmq %xmm1, %xmm1, %k1
   1109 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1110 ; SKX_32-NEXT:    vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
   1111 ; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
   1112 ; SKX_32-NEXT:    retl
   1113   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   1114   %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
   1115   %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
   1116   ret <2 x i32>%res
   1117 }
   1118 
; NOTE(review): v2i32 gather with a constant all-true mask. KNL_64 materializes
; the 2-lane mask as the immediate $3 into a k-register; SKX folds it to
; kxnorw (all-ones). KNL_32 builds the mask from constant-pool data.
   1119 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
   1120 ; KNL_64-LABEL: test24:
   1121 ; KNL_64:       # BB#0:
   1122 ; KNL_64:         movb $3, %al
   1123 ; KNL_64-NEXT:    kmovw %eax, %k1
   1124 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
   1125 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
   1126 ; KNL_64-NEXT:    retq
   1127 ;
   1128 ; KNL_32-LABEL: test24:
   1129 ; KNL_32:       # BB#0:
   1130 ; KNL_32:         movl {{[0-9]+}}(%esp), %eax
   1131 ; KNL_32-NEXT:    vpxord %zmm1, %zmm1, %zmm1
   1132 ; KNL_32-NEXT:    vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
   1133 ; KNL_32-NEXT:    vpsllvq .LCPI23_1, %zmm1, %zmm1
   1134 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
   1135 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
   1136 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
   1137 ; KNL_32-NEXT:    retl
   1138 ;
   1139 ; SKX-LABEL: test24:
   1140 ; SKX:       # BB#0:
   1141 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
   1142 ; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
   1143 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
   1144 ; SKX-NEXT:    retq
   1145 ;
   1146 ; SKX_32-LABEL: test24:
   1147 ; SKX_32:       # BB#0:
   1148 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1149 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
   1150 ; SKX_32-NEXT:    vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
   1151 ; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
   1152 ; SKX_32-NEXT:    retl
   1153   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   1154   %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
   1155   %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
   1156   ret <2 x i32>%res
   1157 }
   1158 
; NOTE(review): v2i64 masked gather with pass-through %src0 — native element
; width, so no promotion of the data; only the v2i1 mask needs widening on KNL.
; CHECK lines are golden output.
   1159 define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
   1160 ;
   1161 ; KNL_64-LABEL: test25:
   1162 ; KNL_64:       # BB#0:
   1163 ; KNL_64:         vpxord %zmm3, %zmm3, %zmm3
   1164 ; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
   1165 ; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
   1166 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
   1167 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
   1168 ; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
   1169 ; KNL_64-NEXT:    retq
   1170 ;
   1171 ; KNL_32-LABEL: test25:
   1172 ; KNL_32:       # BB#0:
   1173 ; KNL_32:         vpxord %zmm3, %zmm3, %zmm3
   1174 ; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
   1175 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1176 ; KNL_32-NEXT:    vpsllvq .LCPI24_0, %zmm1, %zmm1
   1177 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
   1178 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
   1179 ; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
   1180 ; KNL_32-NEXT:    retl
   1181 ;
   1182 ; SKX-LABEL: test25:
   1183 ; SKX:       # BB#0:
   1184 ; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
   1185 ; SKX-NEXT:    vptestmq %xmm1, %xmm1, %k1
   1186 ; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
   1187 ; SKX-NEXT:    vmovaps %zmm2, %zmm0
   1188 ; SKX-NEXT:    retq
   1189 ;
   1190 ; SKX_32-LABEL: test25:
   1191 ; SKX_32:       # BB#0:
   1192 ; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
   1193 ; SKX_32-NEXT:    vptestmq %xmm1, %xmm1, %k1
   1194 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1195 ; SKX_32-NEXT:    vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
   1196 ; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
   1197 ; SKX_32-NEXT:    retl
   1198   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   1199   %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
   1200   %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
   1201   ret <2 x i64>%res
   1202 }
   1203 
; NOTE(review): like test25 but with a constant all-true mask and a real
; pass-through %src0 — KNL_64 encodes the 2-lane mask as immediate $3,
; SKX as kxnorw; KNL_32 loads it from the constant pool.
   1204 define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
   1205 ;
   1206 ; KNL_64-LABEL: test26:
   1207 ; KNL_64:       # BB#0:
   1208 ; KNL_64:         movb $3, %al
   1209 ; KNL_64-NEXT:    kmovw %eax, %k1
   1210 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
   1211 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
   1212 ; KNL_64-NEXT:    retq
   1213 ;
   1214 ; KNL_32-LABEL: test26:
   1215 ; KNL_32:       # BB#0:
   1216 ; KNL_32:         movl {{[0-9]+}}(%esp), %eax
   1217 ; KNL_32-NEXT:    vpxord %zmm2, %zmm2, %zmm2
   1218 ; KNL_32-NEXT:    vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
   1219 ; KNL_32-NEXT:    vpsllvq .LCPI25_1, %zmm2, %zmm2
   1220 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
   1221 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
   1222 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
   1223 ; KNL_32-NEXT:    retl
   1224 ;
   1225 ; SKX-LABEL: test26:
   1226 ; SKX:       # BB#0:
   1227 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
   1228 ; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
   1229 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
   1230 ; SKX-NEXT:    retq
   1231 ;
   1232 ; SKX_32-LABEL: test26:
   1233 ; SKX_32:       # BB#0:
   1234 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1235 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
   1236 ; SKX_32-NEXT:    vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
   1237 ; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
   1238 ; SKX_32-NEXT:    retl
   1239   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   1240   %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
   1241   %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
   1242   ret <2 x i64>%res
   1243 }
   1244 
   1245 ; Result type requires widening; all-ones mask
; NOTE(review): v2f32 gather, all-ones constant mask, widened result (see the
; file comment above). Only KNL_64/KNL_32/SKX expectations are checked here —
; there is no SKX_32 block for this test in the current golden output.
   1246 define <2 x float> @test27(float* %base, <2 x i32> %ind) {
   1247 ;
   1248 ; KNL_64-LABEL: test27:
   1249 ; KNL_64:       # BB#0:
   1250 ; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1251 ; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
   1252 ; KNL_64-NEXT:    movb $3, %al
   1253 ; KNL_64-NEXT:    kmovw %eax, %k1
   1254 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
   1255 ; KNL_64-NEXT:    # kill
   1256 ; KNL_64-NEXT:    retq
   1257 ;
   1258 ; KNL_32-LABEL: test27:
   1259 ; KNL_32:       # BB#0:
   1260 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1261 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1262 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
   1263 ; KNL_32-NEXT:    movb $3, %cl
   1264 ; KNL_32-NEXT:    kmovw %ecx, %k1
   1265 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
   1266 ; KNL_32-NEXT:    # kill
   1267 ; KNL_32-NEXT:    retl
   1268 ;
   1269 ; SKX-LABEL: test27:
   1270 ; SKX:       # BB#0:
   1271 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
   1272 ; SKX-NEXT:    movb $3, %al
   1273 ; SKX-NEXT:    kmovb %eax, %k1
   1274 ; SKX-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
   1275 ; SKX-NEXT:    retq
   1276   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   1277   %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
   1278   %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
   1279   ret <2 x float>%res
   1280 }
   1281 
   1282 ; Data type requires promotion, mask is all-ones
; NOTE(review): v2i32 scatter with constant all-true mask (promoted data type,
; per the file comment above). SKX targets materialize the 2-lane mask with
; movb $3 / kmovb; KNL_32 reconstructs it from constant-pool vectors.
   1283 define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
   1284 ;
   1285 ;
   1286 ; KNL_64-LABEL: test28:
   1287 ; KNL_64:       # BB#0:
   1288 ; KNL_64:         vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1289 ; KNL_64-NEXT:    movb $3, %al
   1290 ; KNL_64-NEXT:    kmovw %eax, %k1
   1291 ; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
   1292 ; KNL_64-NEXT:    retq
   1293 ;
   1294 ; KNL_32-LABEL: test28:
   1295 ; KNL_32:       # BB#0:
   1296 ; KNL_32:         vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1297 ; KNL_32-NEXT:    vpxord %zmm2, %zmm2, %zmm2
   1298 ; KNL_32-NEXT:    vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
   1299 ; KNL_32-NEXT:    vpsllvq .LCPI27_1, %zmm2, %zmm2
   1300 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
   1301 ; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
   1302 ; KNL_32-NEXT:    retl
   1303 ;
   1304 ; SKX-LABEL: test28:
   1305 ; SKX:       # BB#0:
   1306 ; SKX:         vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1307 ; SKX-NEXT:    movb $3, %al
   1308 ; SKX-NEXT:    kmovb %eax, %k1
   1309 ; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
   1310 ; SKX-NEXT:    retq
   1311 ;
   1312 ; SKX_32-LABEL: test28:
   1313 ; SKX_32:       # BB#0:
   1314 ; SKX_32:         vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1315 ; SKX_32-NEXT:    movb $3, %al
   1316 ; SKX_32-NEXT:    kmovb %eax, %k1
   1317 ; SKX_32-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
   1318 ; SKX_32-NEXT:    retl
   1319   call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
   1320   ret void
   1321 }
   1322 
   1323 
   1324 ; SCALAR-LABEL: test29
   1325 ; SCALAR:      extractelement <16 x float*>
   1326 ; SCALAR-NEXT: load float
   1327 ; SCALAR-NEXT: insertelement <16 x float>
   1328 ; SCALAR-NEXT: extractelement <16 x float*>
   1329 ; SCALAR-NEXT: load float
   1330 
; NOTE(review): v16f32 gather from a splatted base with a constant partial
; mask: the IR mask is true only in lanes 2, 3 and 5, which matches the
; immediate $44 (0b101100) moved into the k-register in each CHECK block.
; The SCALAR checks above this function verify codegenprepare scalarization.
   1331 define <16 x float> @test29(float* %base, <16 x i32> %ind) {
   1332 ; KNL_64-LABEL: test29:
   1333 ; KNL_64:       # BB#0:
   1334 ; KNL_64-NEXT:    movw $44, %ax
   1335 ; KNL_64-NEXT:    kmovw %eax, %k1
   1336 ; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
   1337 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
   1338 ; KNL_64-NEXT:    retq
   1339 ;
   1340 ; KNL_32-LABEL: test29:
   1341 ; KNL_32:       # BB#0:
   1342 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1343 ; KNL_32-NEXT:    movw $44, %cx
   1344 ; KNL_32-NEXT:    kmovw %ecx, %k1
   1345 ; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
   1346 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
   1347 ; KNL_32-NEXT:    retl
   1348 ;
   1349 ; SKX-LABEL: test29:
   1350 ; SKX:       # BB#0:
   1351 ; SKX-NEXT:    movw $44, %ax
   1352 ; SKX-NEXT:    kmovw %eax, %k1
   1353 ; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
   1354 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
   1355 ; SKX-NEXT:    retq
   1356 
   1357   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
   1358   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
   1359 
   1360   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   1361   %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
   1362 
   1363   %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
   1364   ret <16 x float>%res
   1365 }
   1366 
   1367 ; Check non-power-of-2 case. It should be scalarized.
   1368 declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
; Non-power-of-2 element count (<3 x i32>) has no hardware gather form, so the
; operation is scalarized: each enabled lane takes a conditional scalar load
; (the cond.load* blocks below) and the result is blended with %src0.
define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
; KNL_64-LABEL: test30:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    andl $1, %edx
; KNL_64-NEXT:    andl $1, %esi
; KNL_64-NEXT:    movl %edi, %eax
; KNL_64-NEXT:    andl $1, %eax
; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT:    vpsllq $2, %ymm1, %ymm1
; KNL_64-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
; KNL_64-NEXT:    # implicit-def: %XMM0
; KNL_64-NEXT:    testb $1, %dil
; KNL_64-NEXT:    je .LBB29_2
; KNL_64-NEXT:  # BB#1: # %cond.load
; KNL_64-NEXT:    vmovq %xmm1, %rcx
; KNL_64-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL_64-NEXT:  .LBB29_2: # %else
; KNL_64-NEXT:    testb %sil, %sil
; KNL_64-NEXT:    je .LBB29_4
; KNL_64-NEXT:  # BB#3: # %cond.load1
; KNL_64-NEXT:    vpextrq $1, %xmm1, %rcx
; KNL_64-NEXT:    vpinsrd $1, (%rcx), %xmm0, %xmm0
; KNL_64-NEXT:  .LBB29_4: # %else2
; KNL_64-NEXT:    testb %dl, %dl
; KNL_64-NEXT:    je .LBB29_6
; KNL_64-NEXT:  # BB#5: # %cond.load4
; KNL_64-NEXT:    vextracti128 $1, %ymm1, %xmm1
; KNL_64-NEXT:    vmovq %xmm1, %rcx
; KNL_64-NEXT:    vpinsrd $2, (%rcx), %xmm0, %xmm0
; KNL_64-NEXT:  .LBB29_6: # %else5
; KNL_64-NEXT:    vmovd %eax, %xmm1
; KNL_64-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
; KNL_64-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test30:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebx
; KNL_32-NEXT:  .Ltmp0:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:    pushl %esi
; KNL_32-NEXT:  .Ltmp1:
; KNL_32-NEXT:    .cfi_def_cfa_offset 12
; KNL_32-NEXT:  .Ltmp2:
; KNL_32-NEXT:    .cfi_offset %esi, -12
; KNL_32-NEXT:  .Ltmp3:
; KNL_32-NEXT:    .cfi_offset %ebx, -8
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    andl $1, %eax
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT:    andl $1, %ecx
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; KNL_32-NEXT:    movl %ebx, %edx
; KNL_32-NEXT:    andl $1, %edx
; KNL_32-NEXT:    vpslld $2, %xmm1, %xmm1
; KNL_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; KNL_32-NEXT:    # implicit-def: %XMM0
; KNL_32-NEXT:    testb $1, %bl
; KNL_32-NEXT:    je .LBB29_2
; KNL_32-NEXT:  # BB#1: # %cond.load
; KNL_32-NEXT:    vmovd %xmm1, %esi
; KNL_32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL_32-NEXT:  .LBB29_2: # %else
; KNL_32-NEXT:    testb %cl, %cl
; KNL_32-NEXT:    je .LBB29_4
; KNL_32-NEXT:  # BB#3: # %cond.load1
; KNL_32-NEXT:    vpextrd $1, %xmm1, %esi
; KNL_32-NEXT:    vpinsrd $1, (%esi), %xmm0, %xmm0
; KNL_32-NEXT:  .LBB29_4: # %else2
; KNL_32-NEXT:    testb %al, %al
; KNL_32-NEXT:    je .LBB29_6
; KNL_32-NEXT:  # BB#5: # %cond.load4
; KNL_32-NEXT:    vpextrd $2, %xmm1, %esi
; KNL_32-NEXT:    vpinsrd $2, (%esi), %xmm0, %xmm0
; KNL_32-NEXT:  .LBB29_6: # %else5
; KNL_32-NEXT:    vmovd %edx, %xmm1
; KNL_32-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
; KNL_32-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; KNL_32-NEXT:    popl %esi
; KNL_32-NEXT:    popl %ebx
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test30:
; SKX:       # BB#0:
; SKX-NEXT:    vpslld $31, %xmm2, %xmm2
; SKX-NEXT:    vptestmd %xmm2, %xmm2, %k1
; SKX-NEXT:    kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT:    vpmovsxdq %xmm1, %ymm1
; SKX-NEXT:    vpsllq $2, %ymm1, %ymm1
; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT:    # implicit-def: %XMM0
; SKX-NEXT:    testb %al, %al
; SKX-NEXT:    je .LBB29_2
; SKX-NEXT:  # BB#1: # %cond.load
; SKX-NEXT:    vmovq %xmm1, %rax
; SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT:  .LBB29_2: # %else
; SKX-NEXT:    kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT:    testb %al, %al
; SKX-NEXT:    je .LBB29_4
; SKX-NEXT:  # BB#3: # %cond.load1
; SKX-NEXT:    vpextrq $1, %xmm1, %rax
; SKX-NEXT:    vpinsrd $1, (%rax), %xmm0, %xmm0
; SKX-NEXT:  .LBB29_4: # %else2
; SKX-NEXT:    kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT:    testb %al, %al
; SKX-NEXT:    je .LBB29_6
; SKX-NEXT:  # BB#5: # %cond.load4
; SKX-NEXT:    vextracti64x2 $1, %ymm1, %xmm1
; SKX-NEXT:    vmovq %xmm1, %rax
; SKX-NEXT:    vpinsrd $2, (%rax), %xmm0, %xmm0
; SKX-NEXT:  .LBB29_6: # %else5
; SKX-NEXT:    vpblendmd %xmm0, %xmm3, %xmm0 {%k1}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test30:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    subl $12, %esp
; SKX_32-NEXT:  .Ltmp0:
; SKX_32-NEXT:    .cfi_def_cfa_offset 16
; SKX_32-NEXT:    vpslld $31, %xmm2, %xmm2
; SKX_32-NEXT:    vptestmd %xmm2, %xmm2, %k1
; SKX_32-NEXT:    kmovb %k1, {{[0-9]+}}(%esp)
; SKX_32-NEXT:    vpslld $2, %xmm1, %xmm1
; SKX_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; SKX_32-NEXT:    movb {{[0-9]+}}(%esp), %al
; SKX_32-NEXT:    # implicit-def: %XMM0
; SKX_32-NEXT:    testb %al, %al
; SKX_32-NEXT:    je .LBB29_2
; SKX_32-NEXT:  # BB#1: # %cond.load
; SKX_32-NEXT:    vmovd %xmm1, %eax
; SKX_32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX_32-NEXT:  .LBB29_2: # %else
; SKX_32-NEXT:    kmovb %k1, {{[0-9]+}}(%esp)
; SKX_32-NEXT:    movb {{[0-9]+}}(%esp), %al
; SKX_32-NEXT:    testb %al, %al
; SKX_32-NEXT:    je .LBB29_4
; SKX_32-NEXT:  # BB#3: # %cond.load1
; SKX_32-NEXT:    vpextrd $1, %xmm1, %eax
; SKX_32-NEXT:    vpinsrd $1, (%eax), %xmm0, %xmm0
; SKX_32-NEXT:  .LBB29_4: # %else2
; SKX_32-NEXT:    vmovdqa32 {{[0-9]+}}(%esp), %xmm2
; SKX_32-NEXT:    kmovb %k1, (%esp)
; SKX_32-NEXT:    movb (%esp), %al
; SKX_32-NEXT:    testb %al, %al
; SKX_32-NEXT:    je .LBB29_6
; SKX_32-NEXT:  # BB#5: # %cond.load4
; SKX_32-NEXT:    vpextrd $2, %xmm1, %eax
; SKX_32-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0
; SKX_32-NEXT:  .LBB29_6: # %else5
; SKX_32-NEXT:    vpblendmd %xmm0, %xmm2, %xmm0 {%k1}
; SKX_32-NEXT:    addl $12, %esp
; SKX_32-NEXT:    retl

  %sext_ind = sext <3 x i32> %ind to <3 x i64>
  %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
  %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
  ret <3 x i32>%res
}
   1535 
   1536 declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
   1537 
; NOTE(review): the "KNL" prefix used on the next three lines is not
; registered by any RUN line at the top of this file (only KNL_64, KNL_32,
; SKX, SKX_32 and SCALAR are), so FileCheck never evaluates them; the
; autogenerated per-prefix checks inside @test31 below are what is tested.
; KNL-LABEL: test31
; KNL: vpgatherqq
; KNL: vpgatherqq
; All-ones-mask gather of 16 pointer values. With 64-bit pointers the result
; spans two zmm registers, so two vpgatherqq are emitted (mask built with
; kxnor all-ones); with 32-bit pointers a single vpgatherdd suffices.
define <16 x float*> @test31(<16 x float**> %ptrs) {
; KNL_64-LABEL: test31:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
; KNL_64-NEXT:    kxnorw %k0, %k0, %k2
; KNL_64-NEXT:    vpgatherqq (,%zmm0), %zmm2 {%k2}
; KNL_64-NEXT:    kshiftrw $8, %k1, %k1
; KNL_64-NEXT:    vpgatherqq (,%zmm1), %zmm3 {%k1}
; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
; KNL_64-NEXT:    vmovaps %zmm3, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test31:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test31:
; SKX:       # BB#0:
; SKX-NEXT:    kxnorw %k0, %k0, %k1
; SKX-NEXT:    kxnorw %k0, %k0, %k2
; SKX-NEXT:    vpgatherqq (,%zmm0), %zmm2 {%k2}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vpgatherqq (,%zmm1), %zmm3 {%k1}
; SKX-NEXT:    vmovaps %zmm2, %zmm0
; SKX-NEXT:    vmovaps %zmm3, %zmm1
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test31:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k1}
; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
; SKX_32-NEXT:    retl

  %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
  ret <16 x float*>%res
}
   1581 
; Variable-mask <16 x i32> gather. On 64-bit targets the <16 x i1> mask is
; sign-extended into a k-register (vpmovsxbd/vpslld/vptestmd), split with
; kshiftrw, and two vpgatherqd cover the 16 64-bit pointers; 32-bit targets
; need only one vpgatherdd since the pointer vector fits one zmm.
define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
; KNL_64-LABEL: test_gather_16i32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16i32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16i32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    vextracti32x8 $1, %zmm3, %ymm2
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16i32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    retl
  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
  ret <16 x i32> %res
}
; Variable-mask <16 x i64> gather: the result occupies two zmm registers on
; every target. On 32-bit targets the second half of the %src0 passthru is
; passed on the stack and reloaded from 8(%ebp) after frame realignment.
define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
; KNL_64-LABEL: test_gather_16i64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpgatherqq (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT:    vpgatherqq (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT:    vmovaps %zmm3, %zmm0
; KNL_64-NEXT:    vmovaps %zmm4, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16i64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp4:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp5:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp6:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16i64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpgatherqq (,%zmm0), %zmm3 {%k1}
; SKX-NEXT:    vpgatherqq (,%zmm1), %zmm4 {%k2}
; SKX-NEXT:    vmovaps %zmm3, %zmm0
; SKX-NEXT:    vmovaps %zmm4, %zmm1
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16i64:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:  .Ltmp1:
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:  .Ltmp2:
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:  .Ltmp3:
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
; SKX_32-NEXT:    vpgatherdq (,%ymm0), %zmm2 {%k1}
; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k2}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    retl
  %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
  ret <16 x i64> %res
}
   1704 declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
; Variable-mask <16 x float> gather: floating-point counterpart of the
; test_gather_16i32 case above, using vgatherqps (64-bit pointers, split mask)
; or vgatherdps (32-bit pointers, single instruction).
define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
; KNL_64-LABEL: test_gather_16f32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vgatherqps (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16f32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16f32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    vextractf32x8 $1, %zmm3, %ymm2
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vgatherqps (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vgatherqps (,%zmm0), %ymm3 {%k1}
; SKX-NEXT:    vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16f32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    retl
  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
  ret <16 x float> %res
}
; Variable-mask <16 x double> gather: like test_gather_16i64, the result
; spans two zmm registers; vgatherqpd is used with 64-bit pointers and
; vgatherdpd (with the passthru's second half reloaded from the stack) on
; 32-bit targets.
define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
; KNL_64-LABEL: test_gather_16f64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vgatherqpd (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT:    vgatherqpd (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT:    vmovaps %zmm3, %zmm0
; KNL_64-NEXT:    vmovaps %zmm4, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16f64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp7:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp8:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp9:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vgatherdpd (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vgatherdpd (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16f64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vgatherqpd (,%zmm0), %zmm3 {%k1}
; SKX-NEXT:    vgatherqpd (,%zmm1), %zmm4 {%k2}
; SKX-NEXT:    vmovaps %zmm3, %zmm0
; SKX-NEXT:    vmovaps %zmm4, %zmm1
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16f64:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:  .Ltmp4:
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:  .Ltmp5:
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:  .Ltmp6:
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
; SKX_32-NEXT:    vgatherdpd (,%ymm0), %zmm2 {%k1}
; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vgatherdpd (,%ymm0), %zmm1 {%k2}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    retl
  %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
  ret <16 x double> %res
}
   1827 declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
; Variable-mask <16 x i32> scatter: mirror of test_gather_16i32. 64-bit
; targets split the mask and issue two vpscatterqd; 32-bit targets emit a
; single vpscatterdd.
define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
; KNL_64-LABEL: test_scatter_16i32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16i32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16i32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
; SKX-NEXT:    vextracti32x8 $1, %zmm3, %ymm0
; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_16i32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT:    retl
  call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
; Variable-mask <16 x i64> scatter: the source value spans two zmm registers,
; so two vpscatterqq (64-bit) or two vpscatterdq (32-bit, with the second
; source half reloaded from the stack at 8(%ebp)) are emitted.
define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
; KNL_64-LABEL: test_scatter_16i64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16i64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp10:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp11:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp12:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vpscatterdq %zmm2, (,%ymm0) {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16i64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
; SKX-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_16i64:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:  .Ltmp7:
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:  .Ltmp8:
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:  .Ltmp9:
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
; SKX_32-NEXT:    vpscatterdq %zmm2, (,%ymm0) {%k1}
; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    retl
  call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
   1940 declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
; Variable-mask <16 x float> scatter: floating-point counterpart of
; test_scatter_16i32, using vscatterqps (64-bit pointers, split mask) or a
; single vscatterdps (32-bit pointers).
define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
; KNL_64-LABEL: test_scatter_16f32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16f32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16f32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
; SKX-NEXT:    vextractf32x8 $1, %zmm3, %ymm0
; SKX-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_16f32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT:    retl
  call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
   1982 declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
; Variable-mask <16 x double> scatter: like test_scatter_16i64 but with the
; floating-point forms vscatterqpd (64-bit) / vscatterdpd (32-bit, second
; source half reloaded from 8(%ebp)).
define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
; KNL_64-LABEL: test_scatter_16f64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16f64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp13:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp14:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp15:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vscatterdpd %zmm2, (,%ymm0) {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16f64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
; SKX-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_16f64:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:  .Ltmp10:
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:  .Ltmp11:
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:  .Ltmp12:
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
; SKX_32-NEXT:    vscatterdpd %zmm2, (,%ymm0) {%k1}
; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    retl
  call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
   2054 declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
   2055