; Masked vector load/store lowering tests for x86 (AVX2, KNL/AVX512, SKX).
; RUN: llc -mtriple=x86_64-apple-darwin  -mcpu=knl < %s | FileCheck %s --check-prefix=AVX512
; RUN: llc -mtriple=x86_64-apple-darwin  -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX2
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s --check-prefix=AVX_SCALAR
; RUN: llc -mtriple=x86_64-apple-darwin  -mcpu=skx < %s | FileCheck %s --check-prefix=SKX

; AVX512-LABEL: test1
; AVX512: vmovdqu32       (%rdi), %zmm0 {%k1} {z}

; AVX2-LABEL: test1
; AVX2: vpmaskmovd      {{.*}}(%rdi)
; AVX2: vpmaskmovd      {{.*}}(%rdi)
; AVX2-NOT: blend

; AVX_SCALAR-LABEL: test1
; AVX_SCALAR-NOT: masked
; AVX_SCALAR: extractelement
; AVX_SCALAR: insertelement
; AVX_SCALAR: extractelement
; AVX_SCALAR: insertelement
; Masked <16 x i32> load, undef passthru, mask = (%trigger == 0).
; KNL folds it into one zero-masked vmovdqu32; AVX2 splits it into two
; vpmaskmovd halves with no blend (undef passthru needs no merge);
; CodeGenPrepare scalarizes the intrinsic on plain AVX.
define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef)
  ret <16 x i32> %res
}

; AVX512-LABEL: test2
; AVX512: vmovdqu32       (%rdi), %zmm0 {%k1} {z}

; AVX2-LABEL: test2
; AVX2: vpmaskmovd      {{.*}}(%rdi)
; AVX2: vpmaskmovd      {{.*}}(%rdi)
; AVX2-NOT: blend
; Like test1 but with a zeroinitializer passthru: masked-off lanes must be
; zero, which the AVX512 {z} form and AVX2 vpmaskmovd already provide, so
; still no blend.
define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer)
  ret <16 x i32> %res
}

; AVX512-LABEL: test3
; AVX512: vmovdqu32       %zmm1, (%rdi) {%k1}

; AVX_SCALAR-LABEL: test3
; AVX_SCALAR-NOT: masked
; AVX_SCALAR: extractelement
; AVX_SCALAR: store
; AVX_SCALAR: extractelement
; AVX_SCALAR: store
; AVX_SCALAR: extractelement
; AVX_SCALAR: store
; Masked <16 x i32> store: one masked vmovdqu32 on AVX512; CodeGenPrepare
; scalarizes it to per-lane extractelement + store on targets without
; masked-store support.
define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
  ret void
}

; AVX512-LABEL: test4
; AVX512: vmovups       (%rdi), %zmm{{.*{%k[1-7]}}}

; AVX2-LABEL: test4
; AVX2: vmaskmovps      {{.*}}(%rdi)
; AVX2: vmaskmovps      {{.*}}(%rdi)
; AVX2: blend
; Masked <16 x float> load with a real passthru (%dst): unlike the undef
; case in test1, AVX2 must blend loaded lanes with %dst.
define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst)
  ret <16 x float> %res
}

; AVX512-LABEL: test5
; AVX512: vmovupd (%rdi), %zmm1 {%k1}

; AVX2-LABEL: test5
; AVX2: vmaskmovpd
; AVX2: vblendvpd
; AVX2: vmaskmovpd
; AVX2: vblendvpd
; Masked <8 x double> load with passthru: one merge-masked vmovupd on AVX512;
; two vmaskmovpd + vblendvpd pairs (256-bit halves) on AVX2.
define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst)
  ret <8 x double> %res
}

; AVX2-LABEL: test6
; AVX2: vmaskmovpd
; AVX2: vblendvpd

; SKX-LABEL: test6
; SKX: vmovupd {{.*}}{%k1}
; 128-bit masked <2 x double> load with passthru; SKX (VLX) can use a
; masked xmm vmovupd directly.
define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
  %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
  ret <2 x double> %res
}

; AVX2-LABEL: test7
; AVX2: vmaskmovps      {{.*}}(%rdi)
; AVX2: blend

; SKX-LABEL: test7
; SKX: vmovups (%rdi){{.*}}{%k1}
; 128-bit masked <4 x float> load with passthru.
define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
  ret <4 x float> %res
}

; AVX2-LABEL: test8
; AVX2: vpmaskmovd      {{.*}}(%rdi)
; AVX2: blend

; SKX-LABEL: test8
; SKX: vmovdqu32 (%rdi){{.*}}{%k1}
; 128-bit masked <4 x i32> load with passthru.
define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
  ret <4 x i32> %res
}

; AVX2-LABEL: test9
; AVX2: vpmaskmovd %xmm

; SKX-LABEL: test9
; SKX: vmovdqu32 %xmm{{.*}}{%k1}
; 128-bit masked <4 x i32> store.
define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
  ret void
}

; AVX2-LABEL: test10
; AVX2: vmaskmovpd    (%rdi), %ymm
; AVX2: blend

; SKX-LABEL: test10
; SKX: vmovapd {{.*}}{%k1}
; 256-bit masked <4 x double> load with 32-byte alignment, so SKX may use
; the aligned vmovapd form.
define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
  ret <4 x double> %res
}

; AVX2-LABEL: test11a
; AVX2: vmaskmovps
; AVX2: vblendvps

; SKX-LABEL: test11a
; SKX: vmovaps (%rdi), %ymm1 {%k1}
; AVX512-LABEL: test11a
; AVX512: kshiftlw $8
; AVX512: kshiftrw $8
; AVX512: vmovups (%rdi), %zmm1 {%k1}
; 256-bit masked <8 x float> load (aligned 32): VLX-less AVX512 (KNL) widens
; the op to 512 bits and clears the upper 8 mask bits via kshiftlw/kshiftrw.
define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
  ret <8 x float> %res
}

; SKX-LABEL: test11b
; SKX: vmovdqu32 (%rdi), %ymm1 {%k1}
; AVX512-LABEL: test11b
; AVX512: kshiftlw        $8
; AVX512: kshiftrw        $8
; AVX512: vmovdqu32 (%rdi), %zmm1 {%k1}
; Same as test11a, but the <8 x i1> mask is a function argument (not a
; compare result) and the element type is i32; merge-masked into %dst.
define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
  %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
  ret <8 x i32> %res
}

; SKX-LABEL: test11c
; SKX: vmovaps (%rdi), %ymm0 {%k1} {z}
; AVX512-LABEL: test11c
; AVX512: kshiftlw  $8
; AVX512: kshiftrw  $8
; AVX512: vmovups (%rdi), %zmm0 {%k1} {z}
; i1-mask-argument variant with zeroinitializer passthru: expects the
; zero-masking {z} form.
define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
  ret <8 x float> %res
}

; SKX-LABEL: test11d
; SKX: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
; AVX512-LABEL: test11d
; AVX512: kshiftlw  $8
; AVX512: kshiftrw  $8
; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; Integer (<8 x i32>) counterpart of test11c: zero-masked load with an
; i1-vector mask argument.
define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
  %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
  ret <8 x i32> %res
}

; AVX2-LABEL: test12
; AVX2: vpmaskmovd %ymm

; SKX-LABEL: test12
; SKX: vmovdqu32 {{.*}}{%k1}
; 256-bit masked <8 x i32> store.
define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
  ret void
}

; AVX512-LABEL: test13
; AVX512: vmovups       %zmm1, (%rdi) {%k1}

; 512-bit masked <16 x float> store on AVX512.
define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask)
  ret void
}

; AVX2-LABEL: test14
; AVX2: vpshufd
; AVX2: vmovq
; AVX2: vmaskmovps

; SKX-LABEL: test14
; SKX: kshiftl
; SKX: kshiftr
; SKX: vmovups {{.*}}{%k1}

; Sub-128-bit masked <2 x float> store: the checks show the illegal type is
; widened (mask massaged via vpshufd/vmovq on AVX2, kshift pair on SKX)
; before the masked store.
define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
  ret void
}

; AVX2-LABEL: test15
; AVX2: vpmaskmovd

; SKX-LABEL: test15:
; SKX:       ## BB#0:
; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT:    vpcmpeqq %xmm2, %xmm0, %k1
; SKX-NEXT:    vpmovqd %xmm1, (%rdi) {%k1}
; SKX-NEXT:    retq
; Masked <2 x i32> store: per the SKX body check, the promoted v2i64 value
; is written with a masked truncating vpmovqd store.
define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
  ret void
}

; AVX2-LABEL: test16
; AVX2: vmaskmovps
; AVX2: vblendvps

; SKX-LABEL: test16
; SKX: kshiftl
; SKX: kshiftr
; SKX: vmovups {{.*}}{%k1}
; Sub-128-bit masked <2 x float> load with passthru.
define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
  ret <2 x float> %res
}

; AVX2-LABEL: test17
; AVX2: vpmaskmovd
; AVX2: vblendvps
; AVX2: vpmovsxdq

; SKX-LABEL: test17
; SKX: kshiftl
; SKX: kshiftr
; SKX: vmovdqu32 {{.*}}{%k1}
; Sub-128-bit masked <2 x i32> load with passthru; AVX2 sign-extends the
; loaded lanes back to the promoted 64-bit element type (vpmovsxdq).
define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
  ret <2 x i32> %res
}

; AVX2-LABEL: test18
; AVX2: vmaskmovps
; AVX2-NOT: blend
; AVX2: ret
; Sub-128-bit masked <2 x float> load with an undef passthru: no blend is
; needed; the SKX body check shows the zero-masked {z} form.
define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
; SKX-LABEL: test18:
; SKX:       ## BB#0:
; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SKX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
; SKX-NEXT:    kshiftlw $2, %k0, %k0
; SKX-NEXT:    kshiftrw $2, %k0, %k1
; SKX-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT:    retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
  ret <2 x float> %res
}

; AVX_SCALAR-LABEL: test19
; AVX_SCALAR: load <4 x float>, <4 x float>* %addr, align 4

; All-true constant mask: CodeGenPrepare folds the masked load into an
; ordinary vector load.
define <4 x float> @test19(<4 x i32> %trigger, <4 x float>* %addr) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
  ret <4 x float> %res
}

; AVX_SCALAR-LABEL: test20
; AVX_SCALAR: load float, {{.*}}, align 4
; AVX_SCALAR: insertelement <4 x float> undef, float
; AVX_SCALAR: select <4 x i1> <i1 true, i1 false, i1 true, i1 true>

; Partially-constant mask <1,0,1,1>: CodeGenPrepare emits scalar lane loads
; plus a select against the passthru %src0.
define <4 x float> @test20(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %src0) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 16, <4 x i1><i1 true, i1 false, i1 true, i1 true>, <4 x float> %src0)
  ret <4 x float> %res
}

; AVX_SCALAR-LABEL: test21
; AVX_SCALAR: store <4 x i32> %val
; All-true constant mask on a store: CodeGenPrepare folds it to a plain
; vector store of %val.
define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; AVX_SCALAR-LABEL: test22
; AVX_SCALAR: extractelement <4 x i32> %val, i32 0
; AVX_SCALAR:  store i32
; Single-lane constant mask <1,0,0,0>: CodeGenPrepare reduces the masked
; store to one scalar extract + store of lane 0.
define void @test22(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
  ret void
}

; Declarations of the masked load/store intrinsics exercised above.
; NOTE: these are the pre-overloaded-pointer-type intrinsic names used by
; the LLVM version this test targets.
declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
declare <8 x i32> @llvm.masked.load.v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)

; Masked load of a vector of pointers, used by test23.
declare <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>)

; AVX512-LABEL: test23
; AVX512: vmovdqu64       64(%rdi), %zmm1 {%k2} {z}
; AVX512: vmovdqu64       (%rdi), %zmm0 {%k1} {z}

; Masked load of <16 x i32*> (1024 bits of pointers): split into two
; zero-masked vmovdqu64 loads with the mask split across %k1/%k2.
define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) {
  %mask = icmp eq <16 x i32*> %trigger, zeroinitializer
  %res = call <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer)
  ret <16 x i32*> %res
}

; Opaque-ish struct type so test24 can load a vector of non-trivial pointers.
%mystruct = type { i16, i16, [1 x i8*] }

declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>)

; AVX512-LABEL: test24
; AVX512: vmovdqu64       (%rdi), %zmm0 {%k1} {z}
; AVX512: kshiftrw        $8, %k1, %k1
; AVX512: vmovdqu64       64(%rdi), %zmm1 {%k1} {z}

; Masked load of <16 x %mystruct*> with an i1-vector mask argument: two
; zero-masked vmovdqu64 loads, the high mask half obtained via kshiftrw $8.
define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
  %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer)
  ret <16 x %mystruct*> %res
}

; 1024-bit masked <16 x i64> store: vpmovb2m materializes the mask register,
; then two zmm masked stores with kshiftrw $8 selecting the high 8 mask bits.
define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
; SKX-LABEL: test_store_16i64:
; SKX:       ## BB#0:
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovdqu64 %zmm1, (%rdi) {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovdqu64 %zmm2, 64(%rdi) {%k1}
; SKX-NEXT:    retq
  call void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)
; Floating-point counterpart of test_store_16i64: two masked vmovupd stores.
define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
; SKX-LABEL: test_store_16f64:
; SKX:       ## BB#0:
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovupd %zmm2, 64(%rdi) {%k1}
; SKX-NEXT:    retq
  call void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)
; 1024-bit masked <16 x i64> load merged into %src0: two merge-masked
; vmovdqu64 loads, then register moves to place the result in zmm0/zmm1.
define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
; SKX-LABEL: test_load_16i64:
; SKX:       ## BB#0:
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovdqu64 (%rdi), %zmm1 {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovdqu64 64(%rdi), %zmm2 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    vmovaps %zmm2, %zmm1
; SKX-NEXT:    retq
  %res = call <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
  ret <16 x i64> %res
}
declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
; Floating-point counterpart of test_load_16i64: two merge-masked vmovupd
; loads of the 1024-bit value.
define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
; SKX-LABEL: test_load_16f64:
; SKX:       ## BB#0:
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovupd (%rdi), %zmm1 {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovupd 64(%rdi), %zmm2 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    vmovaps %zmm2, %zmm1
; SKX-NEXT:    retq
  %res = call <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
  ret <16 x double> %res
}
declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)

; 2048-bit masked <32 x double> load: split into four zmm merge-masked loads;
; kshiftrd $16 and kshiftrw $8 carve the 32-lane mask into four 8-bit chunks.
define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0)  {
; SKX-LABEL: test_load_32f64:
; SKX:       ## BB#0:
; SKX-NEXT:    vpmovb2m %ymm0, %k1
; SKX-NEXT:    vmovupd (%rdi), %zmm1 {%k1}
; SKX-NEXT:    kshiftrd $16, %k1, %k2
; SKX-NEXT:    vmovupd 128(%rdi), %zmm3 {%k2}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovupd 64(%rdi), %zmm2 {%k1}
; SKX-NEXT:    kshiftrw $8, %k2, %k1
; SKX-NEXT:    vmovupd 192(%rdi), %zmm4 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    vmovaps %zmm2, %zmm1
; SKX-NEXT:    vmovaps %zmm3, %zmm2
; SKX-NEXT:    vmovaps %zmm4, %zmm3
; SKX-NEXT:    retq
  %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
  ret <32 x double> %res
}
declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
    449