; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
; RUN: llc -mtriple=i386-unknown-linux-gnu  -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
; RUN: llc -mtriple=i386-unknown-linux-gnu  -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR


target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"


; SCALAR-LABEL: test1
; SCALAR:      extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float

define <16 x float> @test1(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test1:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test1:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test1:
; SKX:       # BB#0:
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
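
; A note on the gather intrinsic used throughout this file:
;   llvm.masked.gather(<N x T*> %ptrs, i32 %align, <N x i1> %mask, <N x T> %passthru)
; loads one element per enabled mask lane and keeps the matching %passthru lane
; for disabled ones. As an illustrative sketch only (not part of the test), one
; lane of test1 is equivalent to:
;   %p = extractelement <16 x float*> %gep.random, i32 0
;   %v = load float, float* %p, align 4
;   %r = insertelement <16 x float> undef, float %v, i32 0
; With the all-true mask in test1, the call lowers to a single vgatherdps whose
; all-ones predicate is materialized by kxnorw.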


; SCALAR-LABEL: test2
; SCALAR:      extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT:  %res.phi.else = phi
; SCALAR-NEXT:  %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT:  %ToLoad1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT:  br i1 %ToLoad1, label %cond.load1, label %else2

define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test2:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kmovw %esi, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test2:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test2:
; SKX:       # BB#0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
  %imask = bitcast i16 %mask to <16 x i1>
  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
  ret <16 x float> %res
}

define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test3:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kmovw %esi, %k1
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test3:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test3:
; SKX:       # BB#0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
  %imask = bitcast i16 %mask to <16 x i1>
  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  ret <16 x i32> %res
}


define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test4:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kmovw %esi, %k1
; KNL_64-NEXT:    kmovw %k1, %k2
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm2
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; KNL_64-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test4:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    kmovw %k1, %k2
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm2
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
; KNL_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test4:
; SKX:       # BB#0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    kmovw %k1, %k2
; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; SKX-NEXT:    vmovaps %zmm1, %zmm2
; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
  %imask = bitcast i16 %mask to <16 x i1>
  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
  %res = add <16 x i32> %gt1, %gt2
  ret <16 x i32> %res
}


; SCALAR-LABEL: test5
; SCALAR:        %Mask0 = extractelement <16 x i1> %imask, i32 0
; SCALAR-NEXT:   %ToStore0 = icmp eq i1 %Mask0, true
; SCALAR-NEXT:   br i1 %ToStore0, label %cond.store, label %else
; SCALAR: cond.store:
; SCALAR-NEXT:  %Elt0 = extractelement <16 x i32> %val, i32 0
; SCALAR-NEXT:  %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
; SCALAR-NEXT:  store i32 %Elt0, i32* %Ptr0, align 4
; SCALAR-NEXT:  br label %else
; SCALAR: else:
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT:  %ToStore1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT:  br i1 %ToStore1, label %cond.store1, label %else2

define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; KNL_64-LABEL: test5:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kmovw %esi, %k1
; KNL_64-NEXT:    kmovw %k1, %k2
; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test5:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    kmovw %k1, %k2
; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test5:
; SKX:       # BB#0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    kmovw %k1, %k2
; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
  %imask = bitcast i16 %mask to <16 x i1>
  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
  ret void
}
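
; llvm.masked.scatter is the store-side counterpart: each enabled mask lane
; stores its element through the matching pointer lane. The two identical
; scatter calls in test5 are intentionally both emitted (two vpscatterdd, with
; a copy of the mask for each); scatters write memory, so the repeated call is
; not folded away.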

declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )


; SCALAR-LABEL: test6
; SCALAR:        store i32 %Elt0, i32* %Ptr01, align 4
; SCALAR-NEXT:   %Elt1 = extractelement <8 x i32> %a1, i32 1
; SCALAR-NEXT:   %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
; SCALAR-NEXT:   store i32 %Elt1, i32* %Ptr12, align 4
; SCALAR-NEXT:   %Elt2 = extractelement <8 x i32> %a1, i32 2
; SCALAR-NEXT:   %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
; SCALAR-NEXT:   store i32 %Elt2, i32* %Ptr23, align 4

define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; KNL_64-LABEL: test6:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    kxnorw %k2, %k2, %k2
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test6:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm2
; KNL_32-NEXT:    kxnorw %k2, %k2, %k2
; KNL_32-NEXT:    vpgatherqd (,%zmm2), %ymm1 {%k2}
; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm2) {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test6:
; SKX:       # BB#0:
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    kxnorw %k2, %k2, %k2
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; SKX-NEXT:    vmovaps %zmm2, %zmm0
; SKX-NEXT:    retq

  %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)

  call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <8 x i32>%a
}

define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
;
; KNL_64-LABEL: test7:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    movzbl %sil, %eax
; KNL_64-NEXT:    kmovw %eax, %k1
; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT:    kmovw %k1, %k2
; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm2
; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test7:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT:    kmovw %k1, %k2
; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm2
; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test7:
; SKX:       # BB#0:
; SKX-NEXT:    kmovb %esi, %k1
; SKX-NEXT:    kmovw %k1, %k2
; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
; SKX-NEXT:    vmovaps %zmm1, %zmm2
; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
; SKX-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer

  %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
  %imask = bitcast i8 %mask to <8 x i1>
  %gt1 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
  %gt2 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
  %res = add <8 x i32> %gt1, %gt2
  ret <8 x i32> %res
}

; No uniform base in this case: the pointer vector itself carries the addresses,
; so each gather call is split into two on 64-bit targets
define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test8:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kmovw %edi, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    kmovw %k2, %k3
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k3}
; KNL_64-NEXT:    kmovw %k1, %k3
; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k3}
; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm4
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test8:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    kmovw %k1, %k2
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm2
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test8:
; SKX:       # BB#0:
; SKX-NEXT:    kmovw %edi, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    kmovw %k2, %k3
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k3}
; SKX-NEXT:    kmovw %k1, %k3
; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k3}
; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm4
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test8:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT:    kmovw %k1, %k2
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k2}
; SKX_32-NEXT:    vmovaps %zmm1, %zmm2
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; SKX_32-NEXT:    retl

  %imask = bitcast i16 %mask to <16 x i1>
  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
  %res = add <16 x i32> %gt1, %gt2
  ret <16 x i32> %res
}
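
; What the checks above encode: on the 64-bit targets the <16 x i32*> pointer
; operand legalizes as two <8 x i64> halves (zmm0 and zmm1), so every gather
; call becomes two vpgatherqd instructions, with kshiftrw $8 extracting the
; upper half of the 16-bit mask. On the 32-bit targets pointers are 32 bits
; wide and a single vpgatherdd per call suffices.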

%struct.RT = type { i8, [10 x [20 x i32]], i8 }
%struct.ST = type { i32, double, %struct.RT }

; Masked gather for aggregate types
; Test9 and Test10 should give the same result (scalar and vector indices in GEP)


define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_64-LABEL: test9:
; KNL_64:       # BB#0: # %entry
; KNL_64-NEXT:    vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm4
; KNL_64-NEXT:    vpsrlq $32, %zmm1, %zmm1
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
; KNL_64-NEXT:    vpsllq $32, %zmm1, %zmm1
; KNL_64-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT:    vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test9:
; KNL_32:       # BB#0: # %entry
; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
; KNL_32-NEXT:    vpbroadcastd .LCPI8_0, %ymm3
; KNL_32-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
; KNL_32-NEXT:    vpbroadcastd .LCPI8_1, %ymm3
; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT:    vpbroadcastd .LCPI8_2, %ymm1
; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test9:
; SKX:       # BB#0: # %entry
; SKX-NEXT:    vpbroadcastq %rdi, %zmm2
; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX-NEXT:    retq
entry:
  %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer

  %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
  %res = call <8 x i32 >  @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %res
}

define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_64-LABEL: test10:
; KNL_64:       # BB#0: # %entry
; KNL_64-NEXT:    vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm4
; KNL_64-NEXT:    vpsrlq $32, %zmm1, %zmm1
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
; KNL_64-NEXT:    vpsllq $32, %zmm1, %zmm1
; KNL_64-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT:    vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test10:
; KNL_32:       # BB#0: # %entry
; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
; KNL_32-NEXT:    vpbroadcastd .LCPI9_0, %ymm3
; KNL_32-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
; KNL_32-NEXT:    vpbroadcastd .LCPI9_1, %ymm3
; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT:    vpbroadcastd .LCPI9_2, %ymm1
; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test10:
; SKX:       # BB#0: # %entry
; SKX-NEXT:    vpbroadcastq %rdi, %zmm2
; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX-NEXT:    retq
entry:
  %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer

  %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
  %res = call <8 x i32 >  @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %res
}
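
; For tests 9 and 10 the address of each gathered element is, conceptually,
;   %base + %ind1 * sizeof(%struct.ST)
;         + offsetof(%struct.ST, field 2) + offsetof(%struct.RT, field 1)
;         + %ind5 * sizeof([20 x i32]) + 13 * sizeof(i32)
; Whether the constant GEP indices are written as splat vectors (test9) or as
; scalars (test10), the same vpmullq/vpaddq address arithmetic and the same
; final vpgatherqd are expected.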

; Splat index in GEP, requires broadcast
define <16 x float> @test11(float* %base, i32 %ind) {
; KNL_64-LABEL: test11:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpbroadcastd %esi, %zmm1
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test11:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %zmm1
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test11:
; SKX:       # BB#0:
; SKX-NEXT:    vpbroadcastd %esi, %zmm1
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

; We check the uniform base here; it is taken directly as the base operand of vgatherdps
define <16 x float> @test12(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test12:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test12:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test12:
; SKX:       # BB#0:
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

; The same as the previous, but the mask is undefined
define <16 x float> @test13(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test13:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test13:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test13:
; SKX:       # BB#0:
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
  ret <16 x float>%res
}

; The base pointer is not a splat, so no uniform base can be found
define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; KNL_64-LABEL: test14:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; KNL_64-NEXT:    vpbroadcastq %xmm0, %zmm0
; KNL_64-NEXT:    vmovd %esi, %xmm1
; KNL_64-NEXT:    vpbroadcastd %xmm1, %ymm1
; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT:    vpsllq $2, %zmm1, %zmm1
; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT:    kshiftrw $8, %k0, %k1
; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm1 {%k1}
; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm2 {%k1}
; KNL_64-NEXT:    vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test14:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; KNL_32-NEXT:    vpbroadcastd %xmm0, %zmm0
; KNL_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; KNL_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
; KNL_32-NEXT:    vgatherdps (,%zmm1), %zmm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test14:
; SKX:       # BB#0:
; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT:    vpbroadcastq %xmm0, %zmm0
; SKX-NEXT:    vmovd %esi, %xmm1
; SKX-NEXT:    vpbroadcastd %xmm1, %ymm1
; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
; SKX-NEXT:    vpsllq $2, %zmm1, %zmm1
; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT:    kshiftrw $8, %k0, %k1
; SKX-NEXT:    vgatherqps (,%zmm0), %ymm1 {%k1}
; SKX-NEXT:    vgatherqps (,%zmm0), %ymm2 {%k1}
; SKX-NEXT:    vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test14:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
; SKX_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; SKX_32-NEXT:    vpbroadcastd %xmm0, %zmm0
; SKX_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; SKX_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
; SKX_32-NEXT:    vgatherdps (,%zmm1), %zmm0 {%k1}
; SKX_32-NEXT:    retl

  %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
  ret <16 x float>%res
}

declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)

; Gather smaller than existing instruction
define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
;
; KNL_64-LABEL: test15:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpxor %ymm2, %ymm2, %ymm2
; KNL_64-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm2
; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm0
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; KNL_64-NEXT:    vptestmq %zmm0, %zmm0, %k1
; KNL_64-NEXT:    vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test15:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpxor %ymm2, %ymm2, %ymm2
; KNL_32-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm2
; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm0
; KNL_32-NEXT:    vpandq .LCPI14_0, %zmm0, %zmm0
; KNL_32-NEXT:    vptestmq %zmm0, %zmm0, %k1
; KNL_32-NEXT:    vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test15:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovd2m %xmm1, %k1
; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
  ret <4 x float>%res
}

; Gather smaller than existing instruction
define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
;
; KNL_64-LABEL: test16:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT:    vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test16:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT:    vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT:    vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT:    vpandq .LCPI15_0, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test16:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovd2m %xmm1, %k1
; SKX-NEXT:    vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
; SKX-NEXT:    vmovaps %zmm2, %zmm0
; SKX-NEXT:    retq

  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
  %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
  ret <4 x double>%res
}

define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
;
; KNL_64-LABEL: test17:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test17:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpandq .LCPI16_0, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test17:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovq2m %xmm1, %k1
; SKX-NEXT:    vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT:    vmovaps %zmm2, %zmm0
; SKX-NEXT:    retq

  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
  %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
  ret <2 x double>%res
}

declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )

define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
;
; KNL_64-LABEL: test18:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT:    vpmovsxdq %ymm2, %zmm2
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test18:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT:    vpmovsxdq %ymm2, %zmm2
; KNL_32-NEXT:    vpandq .LCPI17_0, %zmm2, %zmm2
; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test18:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovd2m %xmm2, %k1
; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT:    retq
  call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
  ret void
}

define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
;
; KNL_64-LABEL: test19:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT:    vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT:    vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test19:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT:    vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT:    vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpandq .LCPI18_0, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test19:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovd2m %xmm1, %k1
; SKX-NEXT:    vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test19:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovd2m %xmm1, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
; SKX_32-NEXT:    retl
  %gep = getelementptr double, double* %ptr, <4 x i64> %ind
  call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
  ret void
}

; Data type requires widening
define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
;
; KNL_64-LABEL: test20:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL_64-NEXT:    vmovq %xmm2, %xmm2
; KNL_64-NEXT:    vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT:    vpmovsxdq %ymm2, %zmm2
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test20:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL_32-NEXT:    vmovq %xmm2, %xmm2
; KNL_32-NEXT:    vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT:    vpmovsxdq %ymm2, %zmm2
; KNL_32-NEXT:    vpandq .LCPI19_0, %zmm2, %zmm2
; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test20:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovq2m %xmm2, %k0
; SKX-NEXT:    kshiftlw $2, %k0, %k0
; SKX-NEXT:    kshiftrw $2, %k0, %k1
; SKX-NEXT:    vscatterqps %xmm0, (,%ymm1) {%k1}
; SKX-NEXT:    retq
  call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
  ret void
}
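
; In test20 the <2 x float> value and <2 x float*> pointers are widened to four
; lanes, so only the low two mask bits may be set: on SKX the kshiftlw $2 /
; kshiftrw $2 pair clears the upper bits of the k register before the
; vscatterqps.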

; Data type requires promotion
define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
;
; KNL_64-LABEL: test21:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT:    vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test21:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT:    vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT:    vpandq .LCPI20_0, %zmm2, %zmm2
; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test21:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovq2m %xmm2, %k0
; SKX-NEXT:    kshiftlw $2, %k0, %k0
; SKX-NEXT:    kshiftrw $2, %k0, %k1
; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT:    retq
  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
  ret void
}
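
; test21 is the promotion analogue: the <2 x i32> elements are carried in
; 64-bit lanes, so vpshufd first packs the two i32 values into the low dwords
; before vpscatterqd, and the mask is trimmed to two bits with the same
; kshiftlw/kshiftrw sequence as in test20.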
    951 
    952 ; The result type requires widening
    953 declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
    954 
    955 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
    956 ;
    957 ;
    958 ; KNL_64-LABEL: test22:
    959 ; KNL_64:       # BB#0:
    960 ; KNL_64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    961 ; KNL_64-NEXT:    vmovq %xmm1, %xmm1
    962 ; KNL_64-NEXT:    vpxor %ymm3, %ymm3, %ymm3
    963 ; KNL_64-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
    964 ; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    965 ; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
    966 ; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
    967 ; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
    968 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
    969 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
    970 ; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
    971 ; KNL_64-NEXT:    retq
    972 ;
    973 ; KNL_32-LABEL: test22:
    974 ; KNL_32:       # BB#0:
    975 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    976 ; KNL_32-NEXT:    vmovq %xmm1, %xmm1
    977 ; KNL_32-NEXT:    vpxor %ymm3, %ymm3, %ymm3
    978 ; KNL_32-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
    979 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    980 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
    981 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
    982 ; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
    983 ; KNL_32-NEXT:    vpandq .LCPI21_0, %zmm1, %zmm1
    984 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
    985 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
    986 ; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
    987 ; KNL_32-NEXT:    retl
    988 ;
    989 ; SKX-LABEL: test22:
    990 ; SKX:       # BB#0:
    991 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    992 ; SKX-NEXT:    vpmovq2m %xmm1, %k0
    993 ; SKX-NEXT:    kshiftlw $2, %k0, %k0
    994 ; SKX-NEXT:    kshiftrw $2, %k0, %k1
    995 ; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
    996 ; SKX-NEXT:    vmovaps %zmm2, %zmm0
    997 ; SKX-NEXT:    retq
    998   %sext_ind = sext <2 x i32> %ind to <2 x i64>
    999   %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
   1000   %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
   1001   ret <2 x float>%res
   1002 }
   1003 
   1004 declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
   1005 declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
   1006 
   1007 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
   1008 ;
   1009 ; KNL_64-LABEL: test23:
   1010 ; KNL_64:       # BB#0:
   1011 ; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
   1012 ; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
   1013 ; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
   1014 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
   1015 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
   1016 ; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
   1017 ; KNL_64-NEXT:    retq
   1018 ;
   1019 ; KNL_32-LABEL: test23:
   1020 ; KNL_32:       # BB#0:
   1021 ; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
   1022 ; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
   1023 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1024 ; KNL_32-NEXT:    vpandq .LCPI22_0, %zmm1, %zmm1
   1025 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
   1026 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
   1027 ; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
   1028 ; KNL_32-NEXT:    retl
   1029 ;
   1030 ; SKX-LABEL: test23:
   1031 ; SKX:       # BB#0:
   1032 ; SKX-NEXT:    vpmovq2m %xmm1, %k1
   1033 ; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
   1034 ; SKX-NEXT:    vmovaps %zmm2, %zmm0
   1035 ; SKX-NEXT:    retq
   1036   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   1037   %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
   1038   %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
   1039   ret <2 x i32>%res
   1040 }
   1041 
   1042 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
   1043 ;
   1044 ;
   1045 ; KNL_64-LABEL: test24:
   1046 ; KNL_64:       # BB#0:
   1047 ; KNL_64-NEXT:    movb $3, %al
   1048 ; KNL_64-NEXT:    movzbl %al, %eax
   1049 ; KNL_64-NEXT:    kmovw %eax, %k1
   1050 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
   1051 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
   1052 ; KNL_64-NEXT:    retq
   1053 ;
   1054 ; KNL_32-LABEL: test24:
   1055 ; KNL_32:       # BB#0:
   1056 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1057 ; KNL_32-NEXT:    vpxord %zmm1, %zmm1, %zmm1
   1058 ; KNL_32-NEXT:    vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
   1059 ; KNL_32-NEXT:    vpandq .LCPI23_1, %zmm1, %zmm1
   1060 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
   1061 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
   1062 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
   1063 ; KNL_32-NEXT:    retl
   1064 ;
   1065 ; SKX-LABEL: test24:
   1066 ; SKX:       # BB#0:
   1067 ; SKX-NEXT:    kxnorw %k1, %k1, %k1
   1068 ; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
   1069 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
   1070 ; SKX-NEXT:    retq
   1071   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   1072   %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
   1073   %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
   1074   ret <2 x i32>%res
   1075 }
   1076 
   1077 define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
   1078 ;
   1079 ; KNL_64-LABEL: test25:
   1080 ; KNL_64:       # BB#0:
   1081 ; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
   1082 ; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
   1083 ; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
   1084 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
   1085 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
   1086 ; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
   1087 ; KNL_64-NEXT:    retq
   1088 ;
   1089 ; KNL_32-LABEL: test25:
   1090 ; KNL_32:       # BB#0:
   1091 ; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
   1092 ; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
   1093 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1094 ; KNL_32-NEXT:    vpandq .LCPI24_0, %zmm1, %zmm1
   1095 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
   1096 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
   1097 ; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
   1098 ; KNL_32-NEXT:    retl
   1099 ;
   1100 ; SKX-LABEL: test25:
   1101 ; SKX:       # BB#0:
   1102 ; SKX-NEXT:    vpmovq2m %xmm1, %k1
   1103 ; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
   1104 ; SKX-NEXT:    vmovaps %zmm2, %zmm0
   1105 ; SKX-NEXT:    retq
   1106   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   1107   %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
   1108   %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
   1109   ret <2 x i64>%res
   1110 }
   1111 
   1112 define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
   1113 ;
   1114 ; KNL_64-LABEL: test26:
   1115 ; KNL_64:       # BB#0:
   1116 ; KNL_64-NEXT:    movb $3, %al
   1117 ; KNL_64-NEXT:    movzbl %al, %eax
   1118 ; KNL_64-NEXT:    kmovw %eax, %k1
   1119 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
   1120 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
   1121 ; KNL_64-NEXT:    retq
   1122 ;
   1123 ; KNL_32-LABEL: test26:
   1124 ; KNL_32:       # BB#0:
   1125 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1126 ; KNL_32-NEXT:    vpxord %zmm2, %zmm2, %zmm2
   1127 ; KNL_32-NEXT:    vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
   1128 ; KNL_32-NEXT:    vpandq .LCPI25_1, %zmm2, %zmm2
   1129 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
   1130 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
   1131 ; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
   1132 ; KNL_32-NEXT:    retl
   1133 ;
   1134 ; SKX-LABEL: test26:
   1135 ; SKX:       # BB#0:
   1136 ; SKX-NEXT:    kxnorw %k1, %k1, %k1
   1137 ; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
   1138 ; SKX-NEXT:    vmovaps %zmm1, %zmm0
   1139 ; SKX-NEXT:    retq
   1140   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   1141   %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
   1142   %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
   1143   ret <2 x i64>%res
   1144 }
   1145 
   1146 ; Result type requires widening; all-ones mask
   1147 define <2 x float> @test27(float* %base, <2 x i32> %ind) {
   1148 ;
   1149 ; KNL_64-LABEL: test27:
   1150 ; KNL_64:       # BB#0:
   1151 ; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1152 ; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
   1153 ; KNL_64-NEXT:    movb $3, %al
   1154 ; KNL_64-NEXT:    movzbl %al, %eax
   1155 ; KNL_64-NEXT:    kmovw %eax, %k1
   1156 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
   1157 ; KNL_64-NEXT:    retq
   1158 ;
   1159 ; KNL_32-LABEL: test27:
   1160 ; KNL_32:       # BB#0:
   1161 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1162 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1163 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
   1164 ; KNL_32-NEXT:    movb $3, %cl
   1165 ; KNL_32-NEXT:    movzbl %cl, %ecx
   1166 ; KNL_32-NEXT:    kmovw %ecx, %k1
   1167 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
   1168 ; KNL_32-NEXT:    retl
   1169 ;
   1170 ; SKX-LABEL: test27:
   1171 ; SKX:       # BB#0:
   1172 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
   1173 ; SKX-NEXT:    movb $3, %al
   1174 ; SKX-NEXT:    kmovb %eax, %k1
   1175 ; SKX-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
   1176 ; SKX-NEXT:    retq
   1177   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   1178   %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
   1179   %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
   1180   ret <2 x float>%res
   1181 }
   1182 
   1183 ; Data type requires promotion, mask is all-ones
   1184 define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
   1185 ;
   1186 ;
; KNL_64-LABEL: test28:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT:    movb $3, %al
; KNL_64-NEXT:    movzbl %al, %eax
; KNL_64-NEXT:    kmovw %eax, %k1
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test28:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT:    vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT:    vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
; KNL_32-NEXT:    vpandq .LCPI27_1, %zmm2, %zmm2
; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test28:
; SKX:       # BB#0:
; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT:    movb $3, %al
; SKX-NEXT:    kmovb %eax, %k1
; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT:    retq
  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
  ret void
}


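; Gather with a compile-time constant mask: only lanes 2, 3 and 5 are set,
; so the mask is materialized as the immediate 44 (0b101100).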
; SCALAR-LABEL: test29
; SCALAR:      extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float

define <16 x float> @test29(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test29:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    movw $44, %ax
; KNL_64-NEXT:    kmovw %eax, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test29:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    movw $44, %cx
; KNL_32-NEXT:    kmovw %ecx, %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test29:
; SKX:       # BB#0:
; SKX-NEXT:    movw $44, %ax
; SKX-NEXT:    kmovw %eax, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
  ret <16 x float>%res
}

; Check the non-power-of-2 case (<3 x i32>). No gather instruction matches
; this width, so it should be scalarized into per-element conditional loads.
declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
; KNL_64-LABEL: test30:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    andl $1, %edx
; KNL_64-NEXT:    kmovw %edx, %k1
; KNL_64-NEXT:    andl $1, %esi
; KNL_64-NEXT:    kmovw %esi, %k2
; KNL_64-NEXT:    movl %edi, %eax
; KNL_64-NEXT:    andl $1, %eax
; KNL_64-NEXT:    kmovw %eax, %k0
; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT:    vpsllq $2, %ymm1, %ymm1
; KNL_64-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
; KNL_64-NEXT:    # implicit-def: %XMM0
; KNL_64-NEXT:    testb $1, %dil
; KNL_64-NEXT:    je .LBB29_2
; KNL_64-NEXT:  # BB#1: # %cond.load
; KNL_64-NEXT:    vmovq %xmm1, %rax
; KNL_64-NEXT:    vmovd (%rax), %xmm0
; KNL_64-NEXT:  .LBB29_2: # %else
; KNL_64-NEXT:    kmovw %k2, %eax
; KNL_64-NEXT:    movl %eax, %ecx
; KNL_64-NEXT:    andl $1, %ecx
; KNL_64-NEXT:    testb %cl, %cl
; KNL_64-NEXT:    je .LBB29_4
; KNL_64-NEXT:  # BB#3: # %cond.load1
; KNL_64-NEXT:    vpextrq $1, %xmm1, %rcx
; KNL_64-NEXT:    vpinsrd $1, (%rcx), %xmm0, %xmm0
; KNL_64-NEXT:  .LBB29_4: # %else2
; KNL_64-NEXT:    kmovw %k1, %ecx
; KNL_64-NEXT:    movl %ecx, %edx
; KNL_64-NEXT:    andl $1, %edx
; KNL_64-NEXT:    testb %dl, %dl
; KNL_64-NEXT:    je .LBB29_6
; KNL_64-NEXT:  # BB#5: # %cond.load4
; KNL_64-NEXT:    vextracti128 $1, %ymm1, %xmm1
; KNL_64-NEXT:    vmovq %xmm1, %rdx
; KNL_64-NEXT:    vpinsrd $2, (%rdx), %xmm0, %xmm0
; KNL_64-NEXT:  .LBB29_6: # %else5
; KNL_64-NEXT:    kmovw %k0, %edx
; KNL_64-NEXT:    vmovd %edx, %xmm1
; KNL_64-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
; KNL_64-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test30:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    andl $1, %eax
; KNL_32-NEXT:    kmovw %eax, %k1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    andl $1, %eax
; KNL_32-NEXT:    kmovw %eax, %k2
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    movl %eax, %ecx
; KNL_32-NEXT:    andl $1, %ecx
; KNL_32-NEXT:    kmovw %ecx, %k0
; KNL_32-NEXT:    vpslld $2, %xmm1, %xmm1
; KNL_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; KNL_32-NEXT:    # implicit-def: %XMM0
; KNL_32-NEXT:    testb $1, %al
; KNL_32-NEXT:    je .LBB29_2
; KNL_32-NEXT:  # BB#1: # %cond.load
; KNL_32-NEXT:    vmovd %xmm1, %eax
; KNL_32-NEXT:    vmovd (%eax), %xmm0
; KNL_32-NEXT:  .LBB29_2: # %else
; KNL_32-NEXT:    kmovw %k2, %eax
; KNL_32-NEXT:    movl %eax, %ecx
; KNL_32-NEXT:    andl $1, %ecx
; KNL_32-NEXT:    testb %cl, %cl
; KNL_32-NEXT:    je .LBB29_4
; KNL_32-NEXT:  # BB#3: # %cond.load1
; KNL_32-NEXT:    vpextrd $1, %xmm1, %ecx
; KNL_32-NEXT:    vpinsrd $1, (%ecx), %xmm0, %xmm0
; KNL_32-NEXT:  .LBB29_4: # %else2
; KNL_32-NEXT:    kmovw %k1, %ecx
; KNL_32-NEXT:    movl %ecx, %edx
; KNL_32-NEXT:    andl $1, %edx
; KNL_32-NEXT:    testb %dl, %dl
; KNL_32-NEXT:    je .LBB29_6
; KNL_32-NEXT:  # BB#5: # %cond.load4
; KNL_32-NEXT:    vpextrd $2, %xmm1, %edx
; KNL_32-NEXT:    vpinsrd $2, (%edx), %xmm0, %xmm0
; KNL_32-NEXT:  .LBB29_6: # %else5
; KNL_32-NEXT:    kmovw %k0, %edx
; KNL_32-NEXT:    vmovd %edx, %xmm1
; KNL_32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
; KNL_32-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test30:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovd2m %xmm2, %k1
; SKX-NEXT:    kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT:    vpmovsxdq %xmm1, %ymm1
; SKX-NEXT:    vpsllq $2, %ymm1, %ymm1
; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT:    # implicit-def: %XMM0
; SKX-NEXT:    andb $1, %al
; SKX-NEXT:    je .LBB29_2
; SKX-NEXT:  # BB#1: # %cond.load
; SKX-NEXT:    vmovq %xmm1, %rax
; SKX-NEXT:    vmovd (%rax), %xmm0
; SKX-NEXT:  .LBB29_2: # %else
; SKX-NEXT:    kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT:    andb $1, %al
; SKX-NEXT:    je .LBB29_4
; SKX-NEXT:  # BB#3: # %cond.load1
; SKX-NEXT:    vpextrq $1, %xmm1, %rax
; SKX-NEXT:    vpinsrd $1, (%rax), %xmm0, %xmm0
; SKX-NEXT:  .LBB29_4: # %else2
; SKX-NEXT:    kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT:    andb $1, %al
; SKX-NEXT:    je .LBB29_6
; SKX-NEXT:  # BB#5: # %cond.load4
; SKX-NEXT:    vextracti128 $1, %ymm1, %xmm1
; SKX-NEXT:    vmovq %xmm1, %rax
; SKX-NEXT:    vpinsrd $2, (%rax), %xmm0, %xmm0
; SKX-NEXT:  .LBB29_6: # %else5
; SKX-NEXT:    vmovdqa32 %xmm0, %xmm3 {%k1}
; SKX-NEXT:    vmovaps %zmm3, %zmm0
; SKX-NEXT:    retq

  %sext_ind = sext <3 x i32> %ind to <3 x i64>
  %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
  %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
  ret <3 x i32>%res
}

declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)

; Gather of pointer values. Pointers are 64 bits wide on the 64-bit targets,
; so the 16 addresses span two zmm registers and two vpgatherqq instructions
; are needed; on the 32-bit targets a single vpgatherdd suffices.
define <16 x float*> @test31(<16 x float**> %ptrs) {
; KNL_64-LABEL: test31:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    kxnorw %k2, %k2, %k2
; KNL_64-NEXT:    vpgatherqq (,%zmm0), %zmm2 {%k2}
; KNL_64-NEXT:    kshiftrw $8, %k1, %k1
; KNL_64-NEXT:    vpgatherqq (,%zmm1), %zmm3 {%k1}
; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
; KNL_64-NEXT:    vmovaps %zmm3, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test31:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test31:
; SKX:       # BB#0:
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    kxnorw %k2, %k2, %k2
; SKX-NEXT:    vpgatherqq (,%zmm0), %zmm2 {%k2}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vpgatherqq (,%zmm1), %zmm3 {%k1}
; SKX-NEXT:    vmovaps %zmm2, %zmm0
; SKX-NEXT:    vmovaps %zmm3, %zmm1
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test31:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    kxnorw %k1, %k1, %k1
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k1}
; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
; SKX_32-NEXT:    retl

  %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
  ret <16 x float*>%res
}

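; The tests below use the maximum width of 16 elements. With 64-bit pointers
; the addresses occupy two zmm registers, so each gather/scatter is split into
; two 8-element halves, each under its half of the mask (kshiftrw $8); 32-bit
; pointers fit in one zmm register and need a single instruction.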
define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
; KNL_64-LABEL: test_gather_16i32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16i32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16i32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    vextracti32x8 $1, %zmm3, %ymm2
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16i32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    retl
  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
  ret <16 x i32> %res
}
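; With i64 elements the data also occupies two zmm registers. On 32-bit
; targets the second half of %src0 arrives on the stack, so the frame is
; realigned to 64 bytes before it is loaded (vmovdqa64 8(%ebp), %zmm1).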
define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
; KNL_64-LABEL: test_gather_16i64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpgatherqq (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT:    vpgatherqq (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT:    vmovaps %zmm3, %zmm0
; KNL_64-NEXT:    vmovaps %zmm4, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16i64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp0:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp1:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp2:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpandd .LCPI32_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16i64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpgatherqq (,%zmm0), %zmm3 {%k1}
; SKX-NEXT:    vpgatherqq (,%zmm1), %zmm4 {%k2}
; SKX-NEXT:    vmovaps %zmm3, %zmm0
; SKX-NEXT:    vmovaps %zmm4, %zmm1
; SKX-NEXT:    retq
  %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
  ret <16 x i64> %res
}
declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
; KNL_64-LABEL: test_gather_16f32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vgatherqps (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16f32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpandd .LCPI33_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16f32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    vextractf32x8 $1, %zmm3, %ymm2
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vgatherqps (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vgatherqps (,%zmm0), %ymm3 {%k1}
; SKX-NEXT:    vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT:    retq
  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
  ret <16 x float> %res
}
define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
; KNL_64-LABEL: test_gather_16f64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vgatherqpd (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT:    vgatherqpd (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT:    vmovaps %zmm3, %zmm0
; KNL_64-NEXT:    vmovaps %zmm4, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16f64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp3:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp4:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp5:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpandd .LCPI34_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vgatherdpd (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vgatherdpd (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16f64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vgatherqpd (,%zmm0), %zmm3 {%k1}
; SKX-NEXT:    vgatherqpd (,%zmm1), %zmm4 {%k2}
; SKX-NEXT:    vmovaps %zmm3, %zmm0
; SKX-NEXT:    vmovaps %zmm4, %zmm1
; SKX-NEXT:    retq
  %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
  ret <16 x double> %res
}
declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
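; Scatter counterparts of the gathers above: on 64-bit targets the value
; vector is likewise split under the two mask halves, extracting the high
; half with vextract* when the elements are only 32 bits wide.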
define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
; KNL_64-LABEL: test_scatter_16i32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16i32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16i32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
; SKX-NEXT:    vextracti32x8 $1, %zmm3, %ymm0
; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_16i32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT:    retl
  call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
; KNL_64-LABEL: test_scatter_16i64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16i64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp6:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp7:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp8:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpandd .LCPI36_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vpscatterdq %zmm2, (,%ymm0) {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16i64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
; SKX-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
; SKX-NEXT:    retq
  call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
; KNL_64-LABEL: test_scatter_16f32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16f32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpandd .LCPI37_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16f32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
; SKX-NEXT:    vextractf32x8 $1, %zmm3, %ymm0
; SKX-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
; SKX-NEXT:    retq
  call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
; KNL_64-LABEL: test_scatter_16f64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16f64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp9:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp10:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp11:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpandd .LCPI38_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vscatterdpd %zmm2, (,%ymm0) {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16f64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
; SKX-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
; SKX-NEXT:    retq
  call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
