; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX

; To test the case where masked load/store is not legal, we should add a run with a target
; that does not have AVX, but that case should probably be a separate test file using fewer tests,
; because it takes over 1.2 seconds to codegen these tests on a 4GHz Haswell if there's no maskmov.
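; A hypothetical invocation for that scenario (sketch only; it would need its
; own check prefixes, so it is deliberately not wired up as a real run line):
;   llc -mtriple=x86_64-apple-darwin -mattr=-avx < %s
; Without maskmov instructions, each masked operation is scalarized into a
; per-element compare-and-branch sequence like the @loadv1 lowering below.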

define <1 x double> @loadv1(<1 x i64> %trigger, <1 x double>* %addr, <1 x double> %dst) {
; AVX-LABEL: loadv1:
; AVX:       ## %bb.0:
; AVX-NEXT:    testq %rdi, %rdi
; AVX-NEXT:    ## implicit-def: $xmm1
; AVX-NEXT:    je LBB0_1
; AVX-NEXT:  ## %bb.2: ## %else
; AVX-NEXT:    testq %rdi, %rdi
; AVX-NEXT:    jne LBB0_3
; AVX-NEXT:  LBB0_4: ## %else
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
; AVX-NEXT:  LBB0_1: ## %cond.load
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    testq %rdi, %rdi
; AVX-NEXT:    je LBB0_4
; AVX-NEXT:  LBB0_3: ## %else
; AVX-NEXT:    vmovaps %xmm0, %xmm1
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: loadv1:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    testq %rdi, %rdi
; AVX512F-NEXT:    ## implicit-def: $xmm1
; AVX512F-NEXT:    jne LBB0_2
; AVX512F-NEXT:  ## %bb.1: ## %cond.load
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT:  LBB0_2: ## %else
; AVX512F-NEXT:    testq %rdi, %rdi
; AVX512F-NEXT:    sete %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512F-NEXT:    retq
;
; SKX-LABEL: loadv1:
; SKX:       ## %bb.0:
; SKX-NEXT:    testq %rdi, %rdi
; SKX-NEXT:    ## implicit-def: $xmm1
; SKX-NEXT:    jne LBB0_2
; SKX-NEXT:  ## %bb.1: ## %cond.load
; SKX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; SKX-NEXT:  LBB0_2: ## %else
; SKX-NEXT:    testq %rdi, %rdi
; SKX-NEXT:    sete %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; SKX-NEXT:    retq
  %mask = icmp eq <1 x i64> %trigger, zeroinitializer
  %res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1>%mask, <1 x double>%dst)
  ret <1 x double> %res
}
declare <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>*, i32, <1 x i1>, <1 x double>)
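
; For reference, llvm.masked.load scalarizes to a guarded load per element
; (an illustrative sketch, not checked by this test): for each element i,
;   br i1 %mask.i, label %cond.load, label %else
; cond.load:
;   %ld = load double, double* %elt.addr, align 4
;   br label %else
; else:
;   %res.i = phi double [ %ld, %cond.load ], [ %dst.i, %entry ]
; which is exactly the compare-and-branch structure llc emits for @loadv1
; above when no maskmov instruction covers the type.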

define void @storev1(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> %val) {
; AVX-LABEL: storev1:
; AVX:       ## %bb.0:
; AVX-NEXT:    testl %edi, %edi
; AVX-NEXT:    je LBB1_1
; AVX-NEXT:  ## %bb.2: ## %else
; AVX-NEXT:    retq
; AVX-NEXT:  LBB1_1: ## %cond.store
; AVX-NEXT:    movl %edx, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: storev1:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    testl %edi, %edi
; AVX512-NEXT:    je LBB1_1
; AVX512-NEXT:  ## %bb.2: ## %else
; AVX512-NEXT:    retq
; AVX512-NEXT:  LBB1_1: ## %cond.store
; AVX512-NEXT:    movl %edx, (%rsi)
; AVX512-NEXT:    retq
  %mask = icmp eq <1 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>%val, <1 x i32>* %addr, i32 4, <1 x i1>%mask)
  ret void
}
declare void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>, <1 x i32>*, i32, <1 x i1>)
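
; The store side is symmetric (sketch only): each element is stored only if
; its mask bit is set,
;   br i1 %mask.i, label %cond.store, label %else
; cond.store:
;   store i32 %val.i, i32* %elt.addr, align 4
;   br label %else
; hence the single test/branch around the movl in @storev1 above.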

define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
; AVX-LABEL: test6:
; AVX:       ## %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2
; AVX-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: test6:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test6:
; SKX:       ## %bb.0:
; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT:    vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT:    retq
  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
  ret <2 x double> %res
}

define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
; AVX-LABEL: test7:
; AVX:       ## %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
; AVX-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: test7:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test7:
; SKX:       ## %bb.0:
; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT:    retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
  ret <4 x float> %res
}

define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
; AVX1-LABEL: test8:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2
; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test8:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test8:
; SKX:       ## %bb.0:
; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT:    vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT:    retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
  ret <4 x i32> %res
}

define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
; AVX1-LABEL: test9:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test9:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test9:
; SKX:       ## %bb.0:
; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT:    vmovdqu32 %xmm1, (%rdi) {%k1}
; SKX-NEXT:    retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
  ret void
}

define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
; AVX1-LABEL: test10:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
; AVX1-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test10:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
; AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test10:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test10:
; SKX:       ## %bb.0:
; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT:    vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT:    retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
  ret <4 x double> %res
}

define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
; AVX1-LABEL: test10b:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test10b:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test10b:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test10b:
; SKX:       ## %bb.0:
; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT:    vmovapd (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT:    retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer)
  ret <4 x double> %res
}

define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
; AVX1-LABEL: test11a:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11a:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test11a:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test11a:
; SKX:       ## %bb.0:
; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1
; SKX-NEXT:    vblendmps (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT:    retq
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
  ret <8 x float> %res
}

define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
; AVX1-LABEL: test11b:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11b:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2
; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test11b:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test11b:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
; SKX-NEXT:    vpmovw2m %xmm0, %k1
; SKX-NEXT:    vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT:    retq
  %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
  ret <8 x i32> %res
}

define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
; AVX1-LABEL: test11c:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11c:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test11c:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test11c:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
; SKX-NEXT:    vpmovw2m %xmm0, %k1
; SKX-NEXT:    vmovaps (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT:    retq
  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
  ret <8 x float> %res
}

define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
; AVX1-LABEL: test11d:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11d:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test11d:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test11d:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
; SKX-NEXT:    vpmovw2m %xmm0, %k1
; SKX-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT:    retq
  %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
  ret <8 x i32> %res
}

define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
; AVX1-LABEL: test12:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test12:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test12:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
; AVX512F-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test12:
; SKX:       ## %bb.0:
; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1
; SKX-NEXT:    vmovdqu32 %ymm1, (%rdi) {%k1}
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
  ret void
}

define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
; AVX1-LABEL: test14:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test14:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test14:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
; AVX512F-NEXT:    vmovups %zmm1, (%rdi) {%k1}
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test14:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT:    vmovups %xmm1, (%rdi) {%k1}
; SKX-NEXT:    retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
  ret void
}

define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
; AVX1-LABEL: test15:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test15:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test15:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
; AVX512F-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test15:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT:    vpmovqd %xmm1, (%rdi) {%k1}
; SKX-NEXT:    retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
  ret void
}

define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
; AVX1-LABEL: test16:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test16:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test16:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test16:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT:    retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
  ret <2 x float> %res
}

define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
; AVX1-LABEL: test17:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2
; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test17:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test17:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SKX-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
; SKX-NEXT:    vpmovsxdq %xmm0, %xmm0
; SKX-NEXT:    retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
  ret <2 x i32> %res
}

define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
; AVX1-LABEL: test18:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test18:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test18:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test18:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT:    retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
  ret <2 x float> %res
}

define <4 x float> @load_all(<4 x i32> %trigger, <4 x float>* %addr) {
; AVX-LABEL: load_all:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: load_all:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    movw $15, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: load_all:
; SKX:       ## %bb.0:
; SKX-NEXT:    kxnorw %k0, %k0, %k1
; SKX-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT:    retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
  ret <4 x float> %res
}

;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend.

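; On AVX512, a constant mask is materialized as an immediate moved into a mask
; register: bit i of the immediate (LSB = element 0) enables element i. For
; example, in the tests below, <i1 1, i1 0, i1 1, i1 1> becomes 0b1101 = 13
; ('movw $13, %ax'), and <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>
; becomes 0b10000111 = 135 ('movw $135, %ax'; the same bit pattern as the
; signed byte -121 in 'movb $-121, %al').
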
; 128-bit FP vectors are supported with AVX.

define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
; AVX-LABEL: mload_constmask_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
; AVX-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v4f32:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    movw $13, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: mload_constmask_v4f32:
; SKX:       ## %bb.0:
; SKX-NEXT:    movb $13, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vmovups (%rdi), %xmm0 {%k1}
; SKX-NEXT:    retq
  %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
  ret <4 x float> %res
}

define <2 x double> @mload_constmask_v2f64(<2 x double>* %addr, <2 x double> %dst) {
; AVX-LABEL: mload_constmask_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: mload_constmask_v2f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX512-NEXT:    retq
  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x double> %dst)
  ret <2 x double> %res
}

; 128-bit integer vectors are supported with AVX2.

define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
; AVX1-LABEL: mload_constmask_v4i32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX1-NEXT:    vmaskmovps (%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: mload_constmask_v4i32:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX2-NEXT:    vpmaskmovd (%rdi), %xmm1, %xmm1
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v4i32:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    movw $14, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: mload_constmask_v4i32:
; SKX:       ## %bb.0:
; SKX-NEXT:    movb $14, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
; SKX-NEXT:    retq
  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
  ret <4 x i32> %res
}

define <2 x i64> @mload_constmask_v2i64(<2 x i64>* %addr, <2 x i64> %dst) {
; AVX-LABEL: mload_constmask_v2i64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vpinsrq $1, 8(%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: mload_constmask_v2i64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpinsrq $1, 8(%rdi), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst)
  ret <2 x i64> %res
}

; 256-bit FP vectors are supported with AVX.

define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) {
; AVX-LABEL: mload_constmask_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
; AVX-NEXT:    vmaskmovps (%rdi), %ymm1, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v8f32:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    movw $7, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; SKX-LABEL: mload_constmask_v8f32:
; SKX:       ## %bb.0:
; SKX-NEXT:    movb $7, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vmovups (%rdi), %ymm0 {%k1}
; SKX-NEXT:    retq
  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
  ret <8 x float> %res
}

define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %dst) {
; AVX-LABEL: mload_constmask_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX-NEXT:    vmaskmovpd (%rdi), %ymm1, %ymm1
; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; AVX-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v4f64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    movb $7, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; SKX-LABEL: mload_constmask_v4f64:
; SKX:       ## %bb.0:
; SKX-NEXT:    movb $7, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vmovupd (%rdi), %ymm0 {%k1}
; SKX-NEXT:    retq
  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
  ret <4 x double> %res
}

; 256-bit integer vectors are supported with AVX2.

define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
; AVX-LABEL: mload_constmask_v8i32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
; AVX-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v8i32:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    movw $135, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; SKX-LABEL: mload_constmask_v8i32:
; SKX:       ## %bb.0:
; SKX-NEXT:    movb $-121, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1}
; SKX-NEXT:    retq
  %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
  ret <8 x i32> %res
}

define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
; AVX-LABEL: mload_constmask_v4i64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
; AVX-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v4i64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    movb $9, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; SKX-LABEL: mload_constmask_v4i64:
; SKX:       ## %bb.0:
; SKX-NEXT:    movb $9, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1}
; SKX-NEXT:    retq
  %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
  ret <4 x i64> %res
}

; 512-bit FP vectors are supported with AVX512.

define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {
; AVX-LABEL: mload_constmask_v8f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v8f64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    movb $-121, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
; AVX512F-NEXT:    retq
;
; SKX-LABEL: mload_constmask_v8f64:
; SKX:       ## %bb.0:
; SKX-NEXT:    movb $-121, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
; SKX-NEXT:    retq
  %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
  ret <8 x double> %res
}

; If the pass-through operand is undef, no blend is needed.
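; (vmaskmovpd and the AVX512 zero-masked loads write zero to masked-off lanes;
; since those lanes are undef in the IR, the maskmov result can be returned
; directly with no blend.)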

define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr) {
; AVX-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    movb $7, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; SKX-LABEL: mload_constmask_v4f64_undef_passthrough:
; SKX:       ## %bb.0:
; SKX-NEXT:    movb $7, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vmovupd (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT:    retq
  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
  ret <4 x double> %res
}

define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
; AVX2-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    movb $6, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; SKX-LABEL: mload_constmask_v4i64_undef_passthrough:
; SKX:       ## %bb.0:
; SKX-NEXT:    movb $6, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT:    retq
  %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
  ret <4 x i64> %res
}

define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
; AVX1-LABEL: test21:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test21:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test21:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    movw $15, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test21:
; SKX:       ## %bb.0:
; SKX-NEXT:    kxnorw %k0, %k0, %k1
; SKX-NEXT:    vmovdqu32 %xmm1, (%rdi) {%k1}
; SKX-NEXT:    retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; When only one element of the mask is set, reduce to a scalar store.
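; For example, storing only element 2 of a <4 x float> is equivalent to the
; scalar sequence (illustrative sketch):
;   %e = extractelement <4 x float> %val, i32 2
;   %p = getelementptr <4 x float>, <4 x float>* %addr, i64 0, i64 2
;   store float %e, float* %p, align 4
; i.e. a 4-byte store at offset 8, matching the 'vextractps $2, %xmm0, 8(%rdi)'
; in @one_mask_bit_set2 below.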
   1091 
   1092 define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
   1093 ; AVX-LABEL: one_mask_bit_set1:
   1094 ; AVX:       ## %bb.0:
   1095 ; AVX-NEXT:    vmovss %xmm0, (%rdi)
   1096 ; AVX-NEXT:    retq
   1097 ;
   1098 ; AVX512-LABEL: one_mask_bit_set1:
   1099 ; AVX512:       ## %bb.0:
   1100 ; AVX512-NEXT:    vmovss %xmm0, (%rdi)
   1101 ; AVX512-NEXT:    retq
   1102   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
   1103   ret void
   1104 }
   1105 
   1106 ; Choose a different element to show that the correct address offset is produced.
   1107 
   1108 define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
   1109 ; AVX-LABEL: one_mask_bit_set2:
   1110 ; AVX:       ## %bb.0:
   1111 ; AVX-NEXT:    vextractps $2, %xmm0, 8(%rdi)
   1112 ; AVX-NEXT:    retq
   1113 ;
   1114 ; AVX512-LABEL: one_mask_bit_set2:
   1115 ; AVX512:       ## %bb.0:
   1116 ; AVX512-NEXT:    vextractps $2, %xmm0, 8(%rdi)
   1117 ; AVX512-NEXT:    retq
   1118   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
   1119   ret void
   1120 }
   1121 
   1122 ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
   1123 
   1124 define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
   1125 ; AVX-LABEL: one_mask_bit_set3:
   1126 ; AVX:       ## %bb.0:
   1127 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1128 ; AVX-NEXT:    vmovlps %xmm0, 16(%rdi)
   1129 ; AVX-NEXT:    vzeroupper
   1130 ; AVX-NEXT:    retq
   1131 ;
   1132 ; AVX512-LABEL: one_mask_bit_set3:
   1133 ; AVX512:       ## %bb.0:
   1134 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1135 ; AVX512-NEXT:    vmovlps %xmm0, 16(%rdi)
   1136 ; AVX512-NEXT:    vzeroupper
   1137 ; AVX512-NEXT:    retq
   1138   call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
   1139   ret void
   1140 }
   1141 
   1142 ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
   1143 
   1144 define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
   1145 ; AVX-LABEL: one_mask_bit_set4:
   1146 ; AVX:       ## %bb.0:
   1147 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1148 ; AVX-NEXT:    vmovhpd %xmm0, 24(%rdi)
   1149 ; AVX-NEXT:    vzeroupper
   1150 ; AVX-NEXT:    retq
   1151 ;
   1152 ; AVX512-LABEL: one_mask_bit_set4:
   1153 ; AVX512:       ## %bb.0:
   1154 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1155 ; AVX512-NEXT:    vmovhpd %xmm0, 24(%rdi)
   1156 ; AVX512-NEXT:    vzeroupper
   1157 ; AVX512-NEXT:    retq
   1158   call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
   1159   ret void
   1160 }
   1161 
   1162 ; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
   1163 
   1164 define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
   1165 ; AVX-LABEL: one_mask_bit_set5:
   1166 ; AVX:       ## %bb.0:
   1167 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
   1168 ; AVX-NEXT:    vmovlps %xmm0, 48(%rdi)
   1169 ; AVX-NEXT:    vzeroupper
   1170 ; AVX-NEXT:    retq
   1171 ;
   1172 ; AVX512-LABEL: one_mask_bit_set5:
   1173 ; AVX512:       ## %bb.0:
   1174 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
   1175 ; AVX512-NEXT:    vmovlps %xmm0, 48(%rdi)
   1176 ; AVX512-NEXT:    vzeroupper
   1177 ; AVX512-NEXT:    retq
   1178   call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
   1179   ret void
   1180 }
   1181 
   1182 ;  When only one element of the mask is set, reduce to a scalar load.
   1183 
   1184 define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
   1185 ; AVX-LABEL: load_one_mask_bit_set1:
   1186 ; AVX:       ## %bb.0:
   1187 ; AVX-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
   1188 ; AVX-NEXT:    retq
   1189 ;
   1190 ; AVX512-LABEL: load_one_mask_bit_set1:
   1191 ; AVX512:       ## %bb.0:
   1192 ; AVX512-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
   1193 ; AVX512-NEXT:    retq
   1194   %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
   1195   ret <4 x i32> %res
   1196 }
   1197 
   1198 ; Choose a different element to show that the correct address offset is produced.
   1199 
   1200 define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
   1201 ; AVX-LABEL: load_one_mask_bit_set2:
   1202 ; AVX:       ## %bb.0:
   1203 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
   1204 ; AVX-NEXT:    retq
   1205 ;
   1206 ; AVX512-LABEL: load_one_mask_bit_set2:
   1207 ; AVX512:       ## %bb.0:
   1208 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
   1209 ; AVX512-NEXT:    retq
   1210   %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
   1211   ret <4 x float> %res
   1212 }
   1213 
   1214 ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
   1215 
   1216 define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
   1217 ; AVX1-LABEL: load_one_mask_bit_set3:
   1218 ; AVX1:       ## %bb.0:
   1219 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1220 ; AVX1-NEXT:    vpinsrq $0, 16(%rdi), %xmm1, %xmm1
   1221 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1222 ; AVX1-NEXT:    retq
   1223 ;
   1224 ; AVX2-LABEL: load_one_mask_bit_set3:
   1225 ; AVX2:       ## %bb.0:
   1226 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1227 ; AVX2-NEXT:    vpinsrq $0, 16(%rdi), %xmm1, %xmm1
   1228 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   1229 ; AVX2-NEXT:    retq
   1230 ;
   1231 ; AVX512-LABEL: load_one_mask_bit_set3:
   1232 ; AVX512:       ## %bb.0:
   1233 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1234 ; AVX512-NEXT:    vpinsrq $0, 16(%rdi), %xmm1, %xmm1
   1235 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   1236 ; AVX512-NEXT:    retq
   1237   %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
   1238   ret <4 x i64> %res
   1239 }
   1240 
   1241 ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.

define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; AVX-LABEL: load_one_mask_bit_set4:
; AVX:       ## %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: load_one_mask_bit_set4:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
  ret <4 x double> %res
}

; Try a 512-bit vector to make sure AVX codegen doesn't crash and AVX512 works as expected.
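;
; Illustrative note (not a checked test; names are hypothetical): without
; AVX512, the <8 x double> value is split across two 256-bit registers
; during legalization, so only the half holding lanes 4-7 changes; element
; 7 sits at byte offset 7 * 8 = 56. The scalar equivalent is:
;   %ptr7 = getelementptr inbounds <8 x double>, <8 x double>* %addr, i64 0, i64 7
;   %elt7 = load double, double* %ptr7, align 4
;   %res  = insertelement <8 x double> %val, double %elt7, i64 7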

define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; AVX-LABEL: load_one_mask_bit_set5:
; AVX:       ## %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: load_one_mask_bit_set5:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
; AVX512-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512-NEXT:    vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
  ret <8 x double> %res
}

; The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed.
; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that.
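;
; Illustrative sketch (not a checked test): vmaskmovps reads only bit 31 of
; each 32-bit mask element, and that is exactly what the signed compare in
; the function below computes:
;   %bool_mask = icmp slt <4 x i32> %mask, zeroinitializer ; true iff the MSB is set
; so the compare folds away and %mask feeds vmaskmovps directly on AVX.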

define void @trunc_mask(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) {
; AVX-LABEL: trunc_mask:
; AVX:       ## %bb.0:
; AVX-NEXT:    vmaskmovps %xmm0, %xmm2, (%rdi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_mask:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpcmpgtd %zmm2, %zmm1, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vmovups %zmm0, (%rdi) {%k1}
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: trunc_mask:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vpcmpgtd %xmm2, %xmm1, %k1
; SKX-NEXT:    vmovups %xmm0, (%rdi) {%k1}
; SKX-NEXT:    retq
  %bool_mask = icmp slt <4 x i32> %mask, zeroinitializer
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %x, <4 x float>* %ptr, i32 1, <4 x i1> %bool_mask)
  ret void
}

; The v3i32 store needs to be widened to v4i32.
; This used to assert during type legalization (PR38436).
; FIXME: The codegen for AVX512 should use KSHIFT to zero the upper bits of the mask.
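; An illustrative widened form (not a checked test; names are hypothetical):
; the v3 store is legalized as a v4 masked store whose extra lane is forced
; to false, which is why the blend with zero in lane 3 shows up below:
;   %p4  = bitcast <3 x i32>* %p to <4 x i32>*
;   %v4  = shufflevector <3 x i32> %v, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
;   %m4  = shufflevector <3 x i1> %mask, <3 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
;   %m4z = and <4 x i1> %m4, <i1 true, i1 true, i1 true, i1 false>
;   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v4, <4 x i32>* %p4, i32 16, <4 x i1> %m4z)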
define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
; AVX1-LABEL: widen_masked_store:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vmovd %edx, %xmm1
; AVX1-NEXT:    vmovd %esi, %xmm2
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT:    vmovd %ecx, %xmm2
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX1-NEXT:    vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: widen_masked_store:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vmovd %edx, %xmm1
; AVX2-NEXT:    vmovd %esi, %xmm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-NEXT:    vmovd %ecx, %xmm2
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vpmaskmovd %xmm0, %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: widen_masked_store:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: widen_masked_store:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vmovdqa32 %xmm1, %xmm1 {%k1} {z}
; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT:    vmovdqa32 %xmm0, (%rdi) {%k1}
; SKX-NEXT:    retq
  call void @llvm.masked.store.v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v3i32(<3 x i32>, <3 x i32>*, i32, <3 x i1>)

declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)
declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)