Home | History | Annotate | Download | only in avx512-shuffles
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
      3 
      ; Unmasked register form: shuffle mask <0,0> duplicates element 0 of %vec into
      ; both lanes. The CHECK lines expect a single vmovddup on xmm0.
      4 define <2 x double> @test_2xdouble_dup_low(<2 x double> %vec) {
      5 ; CHECK-LABEL: test_2xdouble_dup_low:
      6 ; CHECK:       # %bb.0:
      7 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
      8 ; CHECK-NEXT:    retq
      9   %res = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
     10   ret <2 x double> %res
     11 }
     ; Merge-masked register form: lanes where %mask is ordered-equal to zero take
     ; the <0,0> shuffle of %vec, the other lanes come from %vec2. The CHECK lines
     ; expect vmovddup writing into xmm1 under {%k1}, then a copy to xmm0.
     12 define <2 x double> @test_masked_2xdouble_dup_low_mask0(<2 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
     13 ; CHECK-LABEL: test_masked_2xdouble_dup_low_mask0:
     14 ; CHECK:       # %bb.0:
     15 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
     16 ; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
     17 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 {%k1} = xmm0[0,0]
     18 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
     19 ; CHECK-NEXT:    retq
     20   %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
     21   %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
     22   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
     23   ret <2 x double> %res
     24 }
     25 
     ; Zero-masked register form: lanes where %mask is ordered-equal to zero take
     ; the <0,0> shuffle of %vec, the other lanes are zero. The CHECK lines expect
     ; vmovddup with {%k1} {z}.
     26 define <2 x double> @test_masked_z_2xdouble_dup_low_mask0(<2 x double> %vec, <2 x double> %mask) {
     27 ; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mask0:
     28 ; CHECK:       # %bb.0:
     29 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
     30 ; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
     31 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
     32 ; CHECK-NEXT:    retq
     33   %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
     34   %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
     35   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
     36   ret <2 x double> %res
     37 }
     ; Merge-masked register form: lanes where %mask is ordered-equal to zero take
     ; the <0,0> shuffle of %vec, the other lanes come from %vec2. The CHECK lines
     ; expect vmovddup writing into xmm1 under {%k1}, then a copy to xmm0.
     ; The IR body is the same as in the _mask0 variant.
     38 define <2 x double> @test_masked_2xdouble_dup_low_mask1(<2 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
     39 ; CHECK-LABEL: test_masked_2xdouble_dup_low_mask1:
     40 ; CHECK:       # %bb.0:
     41 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
     42 ; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
     43 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 {%k1} = xmm0[0,0]
     44 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
     45 ; CHECK-NEXT:    retq
     46   %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
     47   %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
     48   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
     49   ret <2 x double> %res
     50 }
     51 
     ; Zero-masked register form: lanes where %mask is ordered-equal to zero take
     ; the <0,0> shuffle of %vec, the other lanes are zero. The CHECK lines expect
     ; vmovddup with {%k1} {z}.
     ; The IR body is the same as in the _mask0 variant.
     52 define <2 x double> @test_masked_z_2xdouble_dup_low_mask1(<2 x double> %vec, <2 x double> %mask) {
     53 ; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mask1:
     54 ; CHECK:       # %bb.0:
     55 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
     56 ; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
     57 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
     58 ; CHECK-NEXT:    retq
     59   %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
     60   %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
     61   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
     62   ret <2 x double> %res
     63 }
     ; Unmasked memory form: %vec is loaded from %vp and shuffled with mask <0,0>.
     ; The CHECK lines expect a single vmovddup with a mem[0,0] source.
     64 define <2 x double> @test_2xdouble_dup_low_mem(<2 x double>* %vp) {
     65 ; CHECK-LABEL: test_2xdouble_dup_low_mem:
     66 ; CHECK:       # %bb.0:
     67 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
     68 ; CHECK-NEXT:    retq
     69   %vec = load <2 x double>, <2 x double>* %vp
     70   %res = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
     71   ret <2 x double> %res
     72 }
     ; Merge-masked memory form: %vec is loaded from %vp; lanes where %mask is
     ; ordered-equal to zero take the <0,0> shuffle of %vec, the other lanes come
     ; from %vec2. The CHECK lines expect vmovddup from mem[0,0] under {%k1}.
     73 define <2 x double> @test_masked_2xdouble_dup_low_mem_mask0(<2 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
     74 ; CHECK-LABEL: test_masked_2xdouble_dup_low_mem_mask0:
     75 ; CHECK:       # %bb.0:
     76 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
     77 ; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
     78 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
     79 ; CHECK-NEXT:    retq
     80   %vec = load <2 x double>, <2 x double>* %vp
     81   %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
     82   %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
     83   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
     84   ret <2 x double> %res
     85 }
     86 
     ; Zero-masked memory form: %vec is loaded from %vp; lanes where %mask is
     ; ordered-equal to zero take the <0,0> shuffle of %vec, the other lanes are
     ; zero. The CHECK lines expect vmovddup from mem[0,0] with {%k1} {z}.
     87 define <2 x double> @test_masked_z_2xdouble_dup_low_mem_mask0(<2 x double>* %vp, <2 x double> %mask) {
     88 ; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mem_mask0:
     89 ; CHECK:       # %bb.0:
     90 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
     91 ; CHECK-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
     92 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0]
     93 ; CHECK-NEXT:    retq
     94   %vec = load <2 x double>, <2 x double>* %vp
     95   %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
     96   %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
     97   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
     98   ret <2 x double> %res
     99 }
    ; Merge-masked memory form: %vec is loaded from %vp; lanes where %mask is
    ; ordered-equal to zero take the <0,0> shuffle of %vec, the other lanes come
    ; from %vec2. The CHECK lines expect vmovddup from mem[0,0] under {%k1}.
    ; The IR body is the same as in the _mem_mask0 variant.
    100 define <2 x double> @test_masked_2xdouble_dup_low_mem_mask1(<2 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
    101 ; CHECK-LABEL: test_masked_2xdouble_dup_low_mem_mask1:
    102 ; CHECK:       # %bb.0:
    103 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    104 ; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
    105 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
    106 ; CHECK-NEXT:    retq
    107   %vec = load <2 x double>, <2 x double>* %vp
    108   %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
    109   %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
    110   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
    111   ret <2 x double> %res
    112 }
    113 
    ; Zero-masked memory form: %vec is loaded from %vp; lanes where %mask is
    ; ordered-equal to zero take the <0,0> shuffle of %vec, the other lanes are
    ; zero. The CHECK lines expect vmovddup from mem[0,0] with {%k1} {z}.
    ; The IR body is the same as in the _mem_mask0 variant.
    114 define <2 x double> @test_masked_z_2xdouble_dup_low_mem_mask1(<2 x double>* %vp, <2 x double> %mask) {
    115 ; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mem_mask1:
    116 ; CHECK:       # %bb.0:
    117 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    118 ; CHECK-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
    119 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0]
    120 ; CHECK-NEXT:    retq
    121   %vec = load <2 x double>, <2 x double>* %vp
    122   %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
    123   %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
    124   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
    125   ret <2 x double> %res
    126 }
    ; Unmasked register form: shuffle mask <0,0,2,2> copies each even-indexed
    ; element of %vec into the following odd lane. The CHECK lines expect a
    ; single vmovddup on ymm0.
    127 define <4 x double> @test_4xdouble_dup_low(<4 x double> %vec) {
    128 ; CHECK-LABEL: test_4xdouble_dup_low:
    129 ; CHECK:       # %bb.0:
    130 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
    131 ; CHECK-NEXT:    retq
    132   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    133   ret <4 x double> %res
    134 }
    ; Merge-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2> shuffle of %vec, the other lanes come from %vec2. The CHECK
    ; lines expect vmovddup writing into ymm1 under {%k1}, then a copy to ymm0.
    135 define <4 x double> @test_masked_4xdouble_dup_low_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
    136 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mask0:
    137 ; CHECK:       # %bb.0:
    138 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    139 ; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
    140 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
    141 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
    142 ; CHECK-NEXT:    retq
    143   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    144   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    145   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
    146   ret <4 x double> %res
    147 }
    148 
    ; Zero-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2> shuffle of %vec, the other lanes are zero. The CHECK lines
    ; expect vmovddup with {%k1} {z}.
    149 define <4 x double> @test_masked_z_4xdouble_dup_low_mask0(<4 x double> %vec, <4 x double> %mask) {
    150 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask0:
    151 ; CHECK:       # %bb.0:
    152 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    153 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    154 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
    155 ; CHECK-NEXT:    retq
    156   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    157   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    158   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
    159   ret <4 x double> %res
    160 }
    ; Merge-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2> shuffle of %vec, the other lanes come from %vec2. The CHECK
    ; lines expect vmovddup writing into ymm1 under {%k1}, then a copy to ymm0.
    ; The IR body is the same as in the _mask0 variant.
    161 define <4 x double> @test_masked_4xdouble_dup_low_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
    162 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mask1:
    163 ; CHECK:       # %bb.0:
    164 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    165 ; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
    166 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
    167 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
    168 ; CHECK-NEXT:    retq
    169   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    170   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    171   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
    172   ret <4 x double> %res
    173 }
    174 
    ; Zero-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2> shuffle of %vec, the other lanes are zero. The CHECK lines
    ; expect vmovddup with {%k1} {z}.
    ; The IR body is the same as in the _mask0 variant.
    175 define <4 x double> @test_masked_z_4xdouble_dup_low_mask1(<4 x double> %vec, <4 x double> %mask) {
    176 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask1:
    177 ; CHECK:       # %bb.0:
    178 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    179 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    180 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
    181 ; CHECK-NEXT:    retq
    182   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    183   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    184   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
    185   ret <4 x double> %res
    186 }
    ; Merge-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2> shuffle of %vec, the other lanes come from %vec2. The CHECK
    ; lines expect vmovddup writing into ymm1 under {%k1}, then a copy to ymm0.
    ; The IR body is the same as in the _mask0 variant.
    187 define <4 x double> @test_masked_4xdouble_dup_low_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
    188 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mask2:
    189 ; CHECK:       # %bb.0:
    190 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    191 ; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
    192 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
    193 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
    194 ; CHECK-NEXT:    retq
    195   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    196   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    197   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
    198   ret <4 x double> %res
    199 }
    200 
    ; Zero-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2> shuffle of %vec, the other lanes are zero. The CHECK lines
    ; expect vmovddup with {%k1} {z}.
    ; The IR body is the same as in the _mask0 variant.
    201 define <4 x double> @test_masked_z_4xdouble_dup_low_mask2(<4 x double> %vec, <4 x double> %mask) {
    202 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask2:
    203 ; CHECK:       # %bb.0:
    204 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    205 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    206 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
    207 ; CHECK-NEXT:    retq
    208   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    209   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    210   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
    211   ret <4 x double> %res
    212 }
    ; Merge-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2> shuffle of %vec, the other lanes come from %vec2. The CHECK
    ; lines expect vmovddup writing into ymm1 under {%k1}, then a copy to ymm0.
    ; The IR body is the same as in the _mask0 variant.
    213 define <4 x double> @test_masked_4xdouble_dup_low_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
    214 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mask3:
    215 ; CHECK:       # %bb.0:
    216 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    217 ; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
    218 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
    219 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
    220 ; CHECK-NEXT:    retq
    221   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    222   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    223   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
    224   ret <4 x double> %res
    225 }
    226 
    ; Zero-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2> shuffle of %vec, the other lanes are zero. The CHECK lines
    ; expect vmovddup with {%k1} {z}.
    ; The IR body is the same as in the _mask0 variant.
    227 define <4 x double> @test_masked_z_4xdouble_dup_low_mask3(<4 x double> %vec, <4 x double> %mask) {
    228 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask3:
    229 ; CHECK:       # %bb.0:
    230 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    231 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    232 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
    233 ; CHECK-NEXT:    retq
    234   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    235   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    236   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
    237   ret <4 x double> %res
    238 }
    ; Merge-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2> shuffle of %vec, the other lanes come from %vec2. The CHECK
    ; lines expect vmovddup writing into ymm1 under {%k1}, then a copy to ymm0.
    ; The IR body is the same as in the _mask0 variant.
    239 define <4 x double> @test_masked_4xdouble_dup_low_mask4(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
    240 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mask4:
    241 ; CHECK:       # %bb.0:
    242 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    243 ; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
    244 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
    245 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
    246 ; CHECK-NEXT:    retq
    247   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    248   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    249   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
    250   ret <4 x double> %res
    251 }
    252 
    ; Zero-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2> shuffle of %vec, the other lanes are zero. The CHECK lines
    ; expect vmovddup with {%k1} {z}.
    ; The IR body is the same as in the _mask0 variant.
    253 define <4 x double> @test_masked_z_4xdouble_dup_low_mask4(<4 x double> %vec, <4 x double> %mask) {
    254 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask4:
    255 ; CHECK:       # %bb.0:
    256 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    257 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    258 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
    259 ; CHECK-NEXT:    retq
    260   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    261   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    262   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
    263   ret <4 x double> %res
    264 }
    ; Unmasked memory form: %vec is loaded from %vp and shuffled with mask
    ; <0,0,2,2>. The CHECK lines expect a single vmovddup with a mem[0,0,2,2]
    ; source.
    265 define <4 x double> @test_4xdouble_dup_low_mem(<4 x double>* %vp) {
    266 ; CHECK-LABEL: test_4xdouble_dup_low_mem:
    267 ; CHECK:       # %bb.0:
    268 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
    269 ; CHECK-NEXT:    retq
    270   %vec = load <4 x double>, <4 x double>* %vp
    271   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    272   ret <4 x double> %res
    273 }
    ; Merge-masked memory form: %vec is loaded from %vp; lanes where %mask is
    ; ordered-equal to zero take the <0,0,2,2> shuffle of %vec, the other lanes
    ; come from %vec2. The CHECK lines expect vmovddup from mem[0,0,2,2] under {%k1}.
    274 define <4 x double> @test_masked_4xdouble_dup_low_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
    275 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask0:
    276 ; CHECK:       # %bb.0:
    277 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    278 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    279 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
    280 ; CHECK-NEXT:    retq
    281   %vec = load <4 x double>, <4 x double>* %vp
    282   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    283   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    284   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
    285   ret <4 x double> %res
    286 }
    287 
    ; Zero-masked memory form: %vec is loaded from %vp; lanes where %mask is
    ; ordered-equal to zero take the <0,0,2,2> shuffle of %vec, the other lanes
    ; are zero. The CHECK lines expect vmovddup from mem[0,0,2,2] with {%k1} {z}.
    288 define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask0(<4 x double>* %vp, <4 x double> %mask) {
    289 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask0:
    290 ; CHECK:       # %bb.0:
    291 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    292 ; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
    293 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
    294 ; CHECK-NEXT:    retq
    295   %vec = load <4 x double>, <4 x double>* %vp
    296   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    297   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    298   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
    299   ret <4 x double> %res
    300 }
    ; Merge-masked memory form: %vec is loaded from %vp; lanes where %mask is
    ; ordered-equal to zero take the <0,0,2,2> shuffle of %vec, the other lanes
    ; come from %vec2. The CHECK lines expect vmovddup from mem[0,0,2,2] under {%k1}.
    ; The IR body is the same as in the _mem_mask0 variant.
    301 define <4 x double> @test_masked_4xdouble_dup_low_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
    302 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask1:
    303 ; CHECK:       # %bb.0:
    304 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    305 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    306 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
    307 ; CHECK-NEXT:    retq
    308   %vec = load <4 x double>, <4 x double>* %vp
    309   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    310   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    311   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
    312   ret <4 x double> %res
    313 }
    314 
    ; Zero-masked memory form: %vec is loaded from %vp; lanes where %mask is
    ; ordered-equal to zero take the <0,0,2,2> shuffle of %vec, the other lanes
    ; are zero. The CHECK lines expect vmovddup from mem[0,0,2,2] with {%k1} {z}.
    ; The IR body is the same as in the _mem_mask0 variant.
    315 define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask1(<4 x double>* %vp, <4 x double> %mask) {
    316 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask1:
    317 ; CHECK:       # %bb.0:
    318 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    319 ; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
    320 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
    321 ; CHECK-NEXT:    retq
    322   %vec = load <4 x double>, <4 x double>* %vp
    323   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    324   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    325   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
    326   ret <4 x double> %res
    327 }
    ; Merge-masked memory form: %vec is loaded from %vp; lanes where %mask is
    ; ordered-equal to zero take the <0,0,2,2> shuffle of %vec, the other lanes
    ; come from %vec2. The CHECK lines expect vmovddup from mem[0,0,2,2] under {%k1}.
    ; The IR body is the same as in the _mem_mask0 variant.
    328 define <4 x double> @test_masked_4xdouble_dup_low_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
    329 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask2:
    330 ; CHECK:       # %bb.0:
    331 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    332 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    333 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
    334 ; CHECK-NEXT:    retq
    335   %vec = load <4 x double>, <4 x double>* %vp
    336   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    337   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    338   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
    339   ret <4 x double> %res
    340 }
    341 
    ; Zero-masked memory form: %vec is loaded from %vp; lanes where %mask is
    ; ordered-equal to zero take the <0,0,2,2> shuffle of %vec, the other lanes
    ; are zero. The CHECK lines expect vmovddup from mem[0,0,2,2] with {%k1} {z}.
    ; The IR body is the same as in the _mem_mask0 variant.
    342 define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask2(<4 x double>* %vp, <4 x double> %mask) {
    343 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask2:
    344 ; CHECK:       # %bb.0:
    345 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    346 ; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
    347 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
    348 ; CHECK-NEXT:    retq
    349   %vec = load <4 x double>, <4 x double>* %vp
    350   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    351   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    352   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
    353   ret <4 x double> %res
    354 }
    ; Merge-masked memory form: %vec is loaded from %vp; lanes where %mask is
    ; ordered-equal to zero take the <0,0,2,2> shuffle of %vec, the other lanes
    ; come from %vec2. The CHECK lines expect vmovddup from mem[0,0,2,2] under {%k1}.
    ; The IR body is the same as in the _mem_mask0 variant.
    355 define <4 x double> @test_masked_4xdouble_dup_low_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
    356 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask3:
    357 ; CHECK:       # %bb.0:
    358 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    359 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    360 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
    361 ; CHECK-NEXT:    retq
    362   %vec = load <4 x double>, <4 x double>* %vp
    363   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    364   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    365   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
    366   ret <4 x double> %res
    367 }
    368 
    ; Zero-masked memory form: %vec is loaded from %vp; lanes where %mask is
    ; ordered-equal to zero take the <0,0,2,2> shuffle of %vec, the other lanes
    ; are zero. The CHECK lines expect vmovddup from mem[0,0,2,2] with {%k1} {z}.
    ; The IR body is the same as in the _mem_mask0 variant.
    369 define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask3(<4 x double>* %vp, <4 x double> %mask) {
    370 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask3:
    371 ; CHECK:       # %bb.0:
    372 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    373 ; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
    374 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
    375 ; CHECK-NEXT:    retq
    376   %vec = load <4 x double>, <4 x double>* %vp
    377   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    378   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    379   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
    380   ret <4 x double> %res
    381 }
    ; Merge-masked memory form: %vec is loaded from %vp; lanes where %mask is
    ; ordered-equal to zero take the <0,0,2,2> shuffle of %vec, the other lanes
    ; come from %vec2. The CHECK lines expect vmovddup from mem[0,0,2,2] under {%k1}.
    ; The IR body is the same as in the _mem_mask0 variant.
    382 define <4 x double> @test_masked_4xdouble_dup_low_mem_mask4(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
    383 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask4:
    384 ; CHECK:       # %bb.0:
    385 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    386 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    387 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
    388 ; CHECK-NEXT:    retq
    389   %vec = load <4 x double>, <4 x double>* %vp
    390   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    391   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    392   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
    393   ret <4 x double> %res
    394 }
    395 
    ; Zero-masked memory form: %vec is loaded from %vp; lanes where %mask is
    ; ordered-equal to zero take the <0,0,2,2> shuffle of %vec, the other lanes
    ; are zero. The CHECK lines expect vmovddup from mem[0,0,2,2] with {%k1} {z}.
    ; The IR body is the same as in the _mem_mask0 variant.
    396 define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask4(<4 x double>* %vp, <4 x double> %mask) {
    397 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask4:
    398 ; CHECK:       # %bb.0:
    399 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    400 ; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
    401 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
    402 ; CHECK-NEXT:    retq
    403   %vec = load <4 x double>, <4 x double>* %vp
    404   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    405   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
    406   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
    407   ret <4 x double> %res
    408 }
    ; Unmasked register form: shuffle mask <0,0,2,2,4,4,6,6> copies each
    ; even-indexed element of %vec into the following odd lane. The CHECK lines
    ; expect a single vmovddup on zmm0.
    409 define <8 x double> @test_8xdouble_dup_low(<8 x double> %vec) {
    410 ; CHECK-LABEL: test_8xdouble_dup_low:
    411 ; CHECK:       # %bb.0:
    412 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
    413 ; CHECK-NEXT:    retq
    414   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    415   ret <8 x double> %res
    416 }
    ; Merge-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2,4,4,6,6> shuffle of %vec, the other lanes come from %vec2. The
    ; CHECK lines expect vmovddup writing into zmm1 under {%k1}, then a copy to zmm0.
    417 define <8 x double> @test_masked_8xdouble_dup_low_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
    418 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mask0:
    419 ; CHECK:       # %bb.0:
    420 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    421 ; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
    422 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
    423 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
    424 ; CHECK-NEXT:    retq
    425   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    426   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    427   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
    428   ret <8 x double> %res
    429 }
    430 
    ; Zero-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2,4,4,6,6> shuffle of %vec, the other lanes are zero. The CHECK
    ; lines expect vmovddup with {%k1} {z}.
    431 define <8 x double> @test_masked_z_8xdouble_dup_low_mask0(<8 x double> %vec, <8 x double> %mask) {
    432 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask0:
    433 ; CHECK:       # %bb.0:
    434 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    435 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
    436 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
    437 ; CHECK-NEXT:    retq
    438   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    439   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    440   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
    441   ret <8 x double> %res
    442 }
    ; Merge-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2,4,4,6,6> shuffle of %vec, the other lanes come from %vec2. The
    ; CHECK lines expect vmovddup writing into zmm1 under {%k1}, then a copy to zmm0.
    ; The IR body is the same as in the _mask0 variant.
    443 define <8 x double> @test_masked_8xdouble_dup_low_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
    444 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mask1:
    445 ; CHECK:       # %bb.0:
    446 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    447 ; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
    448 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
    449 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
    450 ; CHECK-NEXT:    retq
    451   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    452   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    453   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
    454   ret <8 x double> %res
    455 }
    456 
    ; Zero-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2,4,4,6,6> shuffle of %vec, the other lanes are zero. The CHECK
    ; lines expect vmovddup with {%k1} {z}.
    ; The IR body is the same as in the _mask0 variant.
    457 define <8 x double> @test_masked_z_8xdouble_dup_low_mask1(<8 x double> %vec, <8 x double> %mask) {
    458 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask1:
    459 ; CHECK:       # %bb.0:
    460 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    461 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
    462 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
    463 ; CHECK-NEXT:    retq
    464   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    465   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    466   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
    467   ret <8 x double> %res
    468 }
    ; Merge-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2,4,4,6,6> shuffle of %vec, the other lanes come from %vec2. The
    ; CHECK lines expect vmovddup writing into zmm1 under {%k1}, then a copy to zmm0.
    ; The IR body is the same as in the _mask0 variant.
    469 define <8 x double> @test_masked_8xdouble_dup_low_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
    470 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mask2:
    471 ; CHECK:       # %bb.0:
    472 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    473 ; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
    474 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
    475 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
    476 ; CHECK-NEXT:    retq
    477   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    478   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    479   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
    480   ret <8 x double> %res
    481 }
    482 
    ; Zero-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2,4,4,6,6> shuffle of %vec, the other lanes are zero. The CHECK
    ; lines expect vmovddup with {%k1} {z}.
    ; The IR body is the same as in the _mask0 variant.
    483 define <8 x double> @test_masked_z_8xdouble_dup_low_mask2(<8 x double> %vec, <8 x double> %mask) {
    484 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask2:
    485 ; CHECK:       # %bb.0:
    486 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    487 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
    488 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
    489 ; CHECK-NEXT:    retq
    490   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    491   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    492   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
    493   ret <8 x double> %res
    494 }
    ; Merge-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2,4,4,6,6> shuffle of %vec, the other lanes come from %vec2. The
    ; CHECK lines expect vmovddup writing into zmm1 under {%k1}, then a copy to zmm0.
    ; The IR body is the same as in the _mask0 variant.
    495 define <8 x double> @test_masked_8xdouble_dup_low_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
    496 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mask3:
    497 ; CHECK:       # %bb.0:
    498 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    499 ; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
    500 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
    501 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
    502 ; CHECK-NEXT:    retq
    503   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    504   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    505   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
    506   ret <8 x double> %res
    507 }
    508 
    ; Zero-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2,4,4,6,6> shuffle of %vec, the other lanes are zero. The CHECK
    ; lines expect vmovddup with {%k1} {z}.
    ; The IR body is the same as in the _mask0 variant.
    509 define <8 x double> @test_masked_z_8xdouble_dup_low_mask3(<8 x double> %vec, <8 x double> %mask) {
    510 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask3:
    511 ; CHECK:       # %bb.0:
    512 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    513 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
    514 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
    515 ; CHECK-NEXT:    retq
    516   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    517   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    518   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
    519   ret <8 x double> %res
    520 }
    ; Merge-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2,4,4,6,6> shuffle of %vec, the other lanes come from %vec2. The
    ; CHECK lines expect vmovddup writing into zmm1 under {%k1}, then a copy to zmm0.
    ; The IR body is the same as in the _mask0 variant.
    521 define <8 x double> @test_masked_8xdouble_dup_low_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
    522 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mask4:
    523 ; CHECK:       # %bb.0:
    524 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    525 ; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
    526 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
    527 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
    528 ; CHECK-NEXT:    retq
    529   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    530   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    531   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
    532   ret <8 x double> %res
    533 }
    534 
    ; Zero-masked register form: lanes where %mask is ordered-equal to zero take
    ; the <0,0,2,2,4,4,6,6> shuffle of %vec, the other lanes are zero. The CHECK
    ; lines expect vmovddup with {%k1} {z}.
    ; The IR body is the same as in the _mask0 variant.
    535 define <8 x double> @test_masked_z_8xdouble_dup_low_mask4(<8 x double> %vec, <8 x double> %mask) {
    536 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask4:
    537 ; CHECK:       # %bb.0:
    538 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    539 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
    540 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
    541 ; CHECK-NEXT:    retq
    542   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    543   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    544   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
    545   ret <8 x double> %res
    546 }
    ; Unmasked memory form: %vec is loaded from %vp and shuffled with mask
    ; <0,0,2,2,4,4,6,6>. The CHECK lines expect a single vmovddup with a
    ; mem[0,0,2,2,4,4,6,6] source.
    547 define <8 x double> @test_8xdouble_dup_low_mem(<8 x double>* %vp) {
    548 ; CHECK-LABEL: test_8xdouble_dup_low_mem:
    549 ; CHECK:       # %bb.0:
    550 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
    551 ; CHECK-NEXT:    retq
    552   %vec = load <8 x double>, <8 x double>* %vp
    553   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    554   ret <8 x double> %res
    555 }
    ; Merge-masked memory form: %vec is loaded from %vp; lanes where %mask is
    ; ordered-equal to zero take the <0,0,2,2,4,4,6,6> shuffle of %vec, the other
    ; lanes come from %vec2. The CHECK lines expect vmovddup from
    ; mem[0,0,2,2,4,4,6,6] under {%k1}.
    556 define <8 x double> @test_masked_8xdouble_dup_low_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
    557 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask0:
    558 ; CHECK:       # %bb.0:
    559 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    560 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
    561 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
    562 ; CHECK-NEXT:    retq
    563   %vec = load <8 x double>, <8 x double>* %vp
    564   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    565   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    566   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
    567   ret <8 x double> %res
    568 }
    569 
    ; Zero-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2,4,4,6,6> shuffle, all other
    ; lanes are zero. Expected code: zero a register, compare into %k1, then a
    ; {%k1} {z} vmovddup with a mem operand writing zmm0.
    570 define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask0(<8 x double>* %vp, <8 x double> %mask) {
    571 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask0:
    572 ; CHECK:       # %bb.0:
    573 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    574 ; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
    575 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
    576 ; CHECK-NEXT:    retq
    577   %vec = load <8 x double>, <8 x double>* %vp
    578   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    579   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    580   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
    581   ret <8 x double> %res
    582 }
    ; Merge-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2,4,4,6,6> shuffle, the rest
    ; keep %vec2. The assertions expect a {%k1}-masked vmovddup with a mem
    ; operand. The IR body matches the _mask0 variant; only the name differs.
    583 define <8 x double> @test_masked_8xdouble_dup_low_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
    584 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask1:
    585 ; CHECK:       # %bb.0:
    586 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    587 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
    588 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
    589 ; CHECK-NEXT:    retq
    590   %vec = load <8 x double>, <8 x double>* %vp
    591   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    592   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    593   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
    594   ret <8 x double> %res
    595 }
    596 
    ; Zero-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2,4,4,6,6> shuffle, all other
    ; lanes are zero. The assertions expect a {%k1} {z} vmovddup with a mem
    ; operand. The IR body matches the _mask0 variant; only the name differs.
    597 define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask1(<8 x double>* %vp, <8 x double> %mask) {
    598 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask1:
    599 ; CHECK:       # %bb.0:
    600 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    601 ; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
    602 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
    603 ; CHECK-NEXT:    retq
    604   %vec = load <8 x double>, <8 x double>* %vp
    605   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    606   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    607   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
    608   ret <8 x double> %res
    609 }
    ; Merge-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2,4,4,6,6> shuffle, the rest
    ; keep %vec2. The assertions expect a {%k1}-masked vmovddup with a mem
    ; operand. The IR body matches the _mask0 variant; only the name differs.
    610 define <8 x double> @test_masked_8xdouble_dup_low_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
    611 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask2:
    612 ; CHECK:       # %bb.0:
    613 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    614 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
    615 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
    616 ; CHECK-NEXT:    retq
    617   %vec = load <8 x double>, <8 x double>* %vp
    618   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    619   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    620   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
    621   ret <8 x double> %res
    622 }
    623 
    ; Zero-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2,4,4,6,6> shuffle, all other
    ; lanes are zero. The assertions expect a {%k1} {z} vmovddup with a mem
    ; operand. The IR body matches the _mask0 variant; only the name differs.
    624 define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask2(<8 x double>* %vp, <8 x double> %mask) {
    625 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask2:
    626 ; CHECK:       # %bb.0:
    627 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    628 ; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
    629 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
    630 ; CHECK-NEXT:    retq
    631   %vec = load <8 x double>, <8 x double>* %vp
    632   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    633   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    634   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
    635   ret <8 x double> %res
    636 }
    ; Merge-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2,4,4,6,6> shuffle, the rest
    ; keep %vec2. The assertions expect a {%k1}-masked vmovddup with a mem
    ; operand. The IR body matches the _mask0 variant; only the name differs.
    637 define <8 x double> @test_masked_8xdouble_dup_low_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
    638 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask3:
    639 ; CHECK:       # %bb.0:
    640 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    641 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
    642 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
    643 ; CHECK-NEXT:    retq
    644   %vec = load <8 x double>, <8 x double>* %vp
    645   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    646   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    647   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
    648   ret <8 x double> %res
    649 }
    650 
    ; Zero-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2,4,4,6,6> shuffle, all other
    ; lanes are zero. The assertions expect a {%k1} {z} vmovddup with a mem
    ; operand. The IR body matches the _mask0 variant; only the name differs.
    651 define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask3(<8 x double>* %vp, <8 x double> %mask) {
    652 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask3:
    653 ; CHECK:       # %bb.0:
    654 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    655 ; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
    656 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
    657 ; CHECK-NEXT:    retq
    658   %vec = load <8 x double>, <8 x double>* %vp
    659   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    660   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    661   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
    662   ret <8 x double> %res
    663 }
    ; Merge-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2,4,4,6,6> shuffle, the rest
    ; keep %vec2. The assertions expect a {%k1}-masked vmovddup with a mem
    ; operand. The IR body matches the _mask0 variant; only the name differs.
    664 define <8 x double> @test_masked_8xdouble_dup_low_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
    665 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask4:
    666 ; CHECK:       # %bb.0:
    667 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    668 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
    669 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
    670 ; CHECK-NEXT:    retq
    671   %vec = load <8 x double>, <8 x double>* %vp
    672   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    673   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    674   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
    675   ret <8 x double> %res
    676 }
    677 
    ; Zero-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2,4,4,6,6> shuffle, all other
    ; lanes are zero. The assertions expect a {%k1} {z} vmovddup with a mem
    ; operand. The IR body matches the _mask0 variant; only the name differs.
    678 define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask4(<8 x double>* %vp, <8 x double> %mask) {
    679 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask4:
    680 ; CHECK:       # %bb.0:
    681 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    682 ; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
    683 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
    684 ; CHECK-NEXT:    retq
    685   %vec = load <8 x double>, <8 x double>* %vp
    686   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    687   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
    688   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
    689   ret <8 x double> %res
    690 }
    ; Unmasked baseline, register source: shuffling %vec with mask <0,0,2,2>
    ; (each even-indexed float duplicated into the following odd lane) is
    ; expected to select as a single vmovsldup on xmm0.
    691 define <4 x float> @test_4xfloat_dup_low(<4 x float> %vec) {
    692 ; CHECK-LABEL: test_4xfloat_dup_low:
    693 ; CHECK:       # %bb.0:
    694 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
    695 ; CHECK-NEXT:    retq
    696   %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    697   ret <4 x float> %res
    698 }
    ; Merge-masked variant, register source: lanes where %mask compares oeq to
    ; zero take the <0,0,2,2> shuffle of %vec, the rest keep %vec2. Expected
    ; code: zero a register, compare into %k1, a {%k1}-masked vmovsldup that
    ; writes xmm1, then a vmovaps copying xmm1 into xmm0.
    699 define <4 x float> @test_masked_4xfloat_dup_low_mask0(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
    700 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mask0:
    701 ; CHECK:       # %bb.0:
    702 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    703 ; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
    704 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
    705 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
    706 ; CHECK-NEXT:    retq
    707   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    708   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    709   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
    710   ret <4 x float> %res
    711 }
    712 
    ; Zero-masked variant, register source: lanes where %mask compares oeq to
    ; zero take the <0,0,2,2> shuffle of %vec; all other lanes are zero.
    ; Expected code: zero a register, compare into %k1, then a single
    ; {%k1} {z} vmovsldup on xmm0.
    713 define <4 x float> @test_masked_z_4xfloat_dup_low_mask0(<4 x float> %vec, <4 x float> %mask) {
    714 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask0:
    715 ; CHECK:       # %bb.0:
    716 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    717 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
    718 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
    719 ; CHECK-NEXT:    retq
    720   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    721   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    722   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
    723   ret <4 x float> %res
    724 }
    ; Merge-masked variant, register source: lanes where %mask compares oeq to
    ; zero take the <0,0,2,2> shuffle of %vec, the rest keep %vec2. The
    ; assertions expect a {%k1}-masked vmovsldup into xmm1 plus a vmovaps copy
    ; to xmm0. The IR body matches the _mask0 variant; only the name differs.
    725 define <4 x float> @test_masked_4xfloat_dup_low_mask1(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
    726 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mask1:
    727 ; CHECK:       # %bb.0:
    728 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    729 ; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
    730 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
    731 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
    732 ; CHECK-NEXT:    retq
    733   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    734   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    735   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
    736   ret <4 x float> %res
    737 }
    738 
    ; Zero-masked variant, register source: lanes where %mask compares oeq to
    ; zero take the <0,0,2,2> shuffle of %vec; all other lanes are zero. The
    ; assertions expect a single {%k1} {z} vmovsldup on xmm0. The IR body
    ; matches the _mask0 variant; only the name differs.
    739 define <4 x float> @test_masked_z_4xfloat_dup_low_mask1(<4 x float> %vec, <4 x float> %mask) {
    740 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask1:
    741 ; CHECK:       # %bb.0:
    742 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    743 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
    744 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
    745 ; CHECK-NEXT:    retq
    746   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    747   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    748   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
    749   ret <4 x float> %res
    750 }
    ; Merge-masked variant, register source: lanes where %mask compares oeq to
    ; zero take the <0,0,2,2> shuffle of %vec, the rest keep %vec2. The
    ; assertions expect a {%k1}-masked vmovsldup into xmm1 plus a vmovaps copy
    ; to xmm0. The IR body matches the _mask0 variant; only the name differs.
    751 define <4 x float> @test_masked_4xfloat_dup_low_mask2(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
    752 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mask2:
    753 ; CHECK:       # %bb.0:
    754 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    755 ; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
    756 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
    757 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
    758 ; CHECK-NEXT:    retq
    759   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    760   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    761   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
    762   ret <4 x float> %res
    763 }
    764 
    ; Zero-masked variant, register source: lanes where %mask compares oeq to
    ; zero take the <0,0,2,2> shuffle of %vec; all other lanes are zero. The
    ; assertions expect a single {%k1} {z} vmovsldup on xmm0. The IR body
    ; matches the _mask0 variant; only the name differs.
    765 define <4 x float> @test_masked_z_4xfloat_dup_low_mask2(<4 x float> %vec, <4 x float> %mask) {
    766 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask2:
    767 ; CHECK:       # %bb.0:
    768 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    769 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
    770 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
    771 ; CHECK-NEXT:    retq
    772   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    773   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    774   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
    775   ret <4 x float> %res
    776 }
    ; Merge-masked variant, register source: lanes where %mask compares oeq to
    ; zero take the <0,0,2,2> shuffle of %vec, the rest keep %vec2. The
    ; assertions expect a {%k1}-masked vmovsldup into xmm1 plus a vmovaps copy
    ; to xmm0. The IR body matches the _mask0 variant; only the name differs.
    777 define <4 x float> @test_masked_4xfloat_dup_low_mask3(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
    778 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mask3:
    779 ; CHECK:       # %bb.0:
    780 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    781 ; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
    782 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
    783 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
    784 ; CHECK-NEXT:    retq
    785   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    786   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    787   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
    788   ret <4 x float> %res
    789 }
    790 
    ; Zero-masked variant, register source: lanes where %mask compares oeq to
    ; zero take the <0,0,2,2> shuffle of %vec; all other lanes are zero. The
    ; assertions expect a single {%k1} {z} vmovsldup on xmm0. The IR body
    ; matches the _mask0 variant; only the name differs.
    791 define <4 x float> @test_masked_z_4xfloat_dup_low_mask3(<4 x float> %vec, <4 x float> %mask) {
    792 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask3:
    793 ; CHECK:       # %bb.0:
    794 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    795 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
    796 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
    797 ; CHECK-NEXT:    retq
    798   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    799   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    800   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
    801   ret <4 x float> %res
    802 }
    ; Merge-masked variant, register source: lanes where %mask compares oeq to
    ; zero take the <0,0,2,2> shuffle of %vec, the rest keep %vec2. The
    ; assertions expect a {%k1}-masked vmovsldup into xmm1 plus a vmovaps copy
    ; to xmm0. The IR body matches the _mask0 variant; only the name differs.
    803 define <4 x float> @test_masked_4xfloat_dup_low_mask4(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
    804 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mask4:
    805 ; CHECK:       # %bb.0:
    806 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    807 ; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
    808 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
    809 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
    810 ; CHECK-NEXT:    retq
    811   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    812   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    813   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
    814   ret <4 x float> %res
    815 }
    816 
    ; Zero-masked variant, register source: lanes where %mask compares oeq to
    ; zero take the <0,0,2,2> shuffle of %vec; all other lanes are zero. The
    ; assertions expect a single {%k1} {z} vmovsldup on xmm0. The IR body
    ; matches the _mask0 variant; only the name differs.
    817 define <4 x float> @test_masked_z_4xfloat_dup_low_mask4(<4 x float> %vec, <4 x float> %mask) {
    818 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask4:
    819 ; CHECK:       # %bb.0:
    820 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    821 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
    822 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
    823 ; CHECK-NEXT:    retq
    824   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    825   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    826   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
    827   ret <4 x float> %res
    828 }
    ; Unmasked baseline, memory source: %vec is loaded from %vp and shuffled with
    ; mask <0,0,2,2>. The assertions expect the load to appear as the mem operand
    ; of a single vmovsldup writing xmm0.
    829 define <4 x float> @test_4xfloat_dup_low_mem(<4 x float>* %vp) {
    830 ; CHECK-LABEL: test_4xfloat_dup_low_mem:
    831 ; CHECK:       # %bb.0:
    832 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = mem[0,0,2,2]
    833 ; CHECK-NEXT:    retq
    834   %vec = load <4 x float>, <4 x float>* %vp
    835   %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    836   ret <4 x float> %res
    837 }
    ; Merge-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2> shuffle, the rest keep %vec2.
    ; Expected code: zero a register, compare into %k1, then a {%k1}-masked
    ; vmovsldup with a mem operand writing xmm0 directly.
    838 define <4 x float> @test_masked_4xfloat_dup_low_mem_mask0(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
    839 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask0:
    840 ; CHECK:       # %bb.0:
    841 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    842 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
    843 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
    844 ; CHECK-NEXT:    retq
    845   %vec = load <4 x float>, <4 x float>* %vp
    846   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    847   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    848   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
    849   ret <4 x float> %res
    850 }
    851 
    ; Zero-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2> shuffle, all other lanes are
    ; zero. Expected code: zero a register, compare into %k1, then a
    ; {%k1} {z} vmovsldup with a mem operand writing xmm0.
    852 define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask0(<4 x float>* %vp, <4 x float> %mask) {
    853 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask0:
    854 ; CHECK:       # %bb.0:
    855 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    856 ; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
    857 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
    858 ; CHECK-NEXT:    retq
    859   %vec = load <4 x float>, <4 x float>* %vp
    860   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    861   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    862   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
    863   ret <4 x float> %res
    864 }
    ; Merge-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2> shuffle, the rest keep %vec2.
    ; The assertions expect a {%k1}-masked vmovsldup with a mem operand. The IR
    ; body matches the _mask0 variant; only the name differs.
    865 define <4 x float> @test_masked_4xfloat_dup_low_mem_mask1(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
    866 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask1:
    867 ; CHECK:       # %bb.0:
    868 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    869 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
    870 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
    871 ; CHECK-NEXT:    retq
    872   %vec = load <4 x float>, <4 x float>* %vp
    873   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    874   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    875   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
    876   ret <4 x float> %res
    877 }
    878 
    ; Zero-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2> shuffle, all other lanes are
    ; zero. The assertions expect a {%k1} {z} vmovsldup with a mem operand. The
    ; IR body matches the _mask0 variant; only the name differs.
    879 define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask1(<4 x float>* %vp, <4 x float> %mask) {
    880 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask1:
    881 ; CHECK:       # %bb.0:
    882 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    883 ; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
    884 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
    885 ; CHECK-NEXT:    retq
    886   %vec = load <4 x float>, <4 x float>* %vp
    887   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    888   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    889   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
    890   ret <4 x float> %res
    891 }
    ; Merge-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2> shuffle, the rest keep %vec2.
    ; The assertions expect a {%k1}-masked vmovsldup with a mem operand. The IR
    ; body matches the _mask0 variant; only the name differs.
    892 define <4 x float> @test_masked_4xfloat_dup_low_mem_mask2(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
    893 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask2:
    894 ; CHECK:       # %bb.0:
    895 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    896 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
    897 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
    898 ; CHECK-NEXT:    retq
    899   %vec = load <4 x float>, <4 x float>* %vp
    900   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    901   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    902   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
    903   ret <4 x float> %res
    904 }
    905 
    ; Zero-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2> shuffle, all other lanes are
    ; zero. The assertions expect a {%k1} {z} vmovsldup with a mem operand. The
    ; IR body matches the _mask0 variant; only the name differs.
    906 define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask2(<4 x float>* %vp, <4 x float> %mask) {
    907 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask2:
    908 ; CHECK:       # %bb.0:
    909 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    910 ; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
    911 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
    912 ; CHECK-NEXT:    retq
    913   %vec = load <4 x float>, <4 x float>* %vp
    914   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    915   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    916   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
    917   ret <4 x float> %res
    918 }
    ; Merge-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2> shuffle, the rest keep %vec2.
    ; The assertions expect a {%k1}-masked vmovsldup with a mem operand. The IR
    ; body matches the _mask0 variant; only the name differs.
    919 define <4 x float> @test_masked_4xfloat_dup_low_mem_mask3(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
    920 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask3:
    921 ; CHECK:       # %bb.0:
    922 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    923 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
    924 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
    925 ; CHECK-NEXT:    retq
    926   %vec = load <4 x float>, <4 x float>* %vp
    927   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    928   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    929   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
    930   ret <4 x float> %res
    931 }
    932 
    ; Zero-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2> shuffle, all other lanes are
    ; zero. The assertions expect a {%k1} {z} vmovsldup with a mem operand. The
    ; IR body matches the _mask0 variant; only the name differs.
    933 define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask3(<4 x float>* %vp, <4 x float> %mask) {
    934 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask3:
    935 ; CHECK:       # %bb.0:
    936 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    937 ; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
    938 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
    939 ; CHECK-NEXT:    retq
    940   %vec = load <4 x float>, <4 x float>* %vp
    941   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    942   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    943   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
    944   ret <4 x float> %res
    945 }
    ; Merge-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2> shuffle, the rest keep %vec2.
    ; The assertions expect a {%k1}-masked vmovsldup with a mem operand. The IR
    ; body matches the _mask0 variant; only the name differs.
    946 define <4 x float> @test_masked_4xfloat_dup_low_mem_mask4(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
    947 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask4:
    948 ; CHECK:       # %bb.0:
    949 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    950 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
    951 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
    952 ; CHECK-NEXT:    retq
    953   %vec = load <4 x float>, <4 x float>* %vp
    954   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    955   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    956   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
    957   ret <4 x float> %res
    958 }
    959 
    ; Zero-masked variant, memory source: %vec is loaded from %vp; lanes where
    ; %mask compares oeq to zero take the <0,0,2,2> shuffle, all other lanes are
    ; zero. The assertions expect a {%k1} {z} vmovsldup with a mem operand. The
    ; IR body matches the _mask0 variant; only the name differs.
    960 define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask4(<4 x float>* %vp, <4 x float> %mask) {
    961 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask4:
    962 ; CHECK:       # %bb.0:
    963 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    964 ; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
    965 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
    966 ; CHECK-NEXT:    retq
    967   %vec = load <4 x float>, <4 x float>* %vp
    968   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
    969   %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
    970   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
    971   ret <4 x float> %res
    972 }
    ; Unmasked baseline, register source: shuffling %vec with mask
    ; <0,0,2,2,4,4,6,6> (each even-indexed float duplicated into the following
    ; odd lane) is expected to select as a single vmovsldup on ymm0.
    973 define <8 x float> @test_8xfloat_dup_low(<8 x float> %vec) {
    974 ; CHECK-LABEL: test_8xfloat_dup_low:
    975 ; CHECK:       # %bb.0:
    976 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
    977 ; CHECK-NEXT:    retq
    978   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    979   ret <8 x float> %res
    980 }
    ; Merge-masked variant, register source: lanes where %mask compares oeq to
    ; zero take the <0,0,2,2,4,4,6,6> shuffle of %vec, the rest keep %vec2.
    ; Expected code: zero a register, compare into %k1, a {%k1}-masked vmovsldup
    ; that writes ymm1, then a vmovaps copying ymm1 into ymm0.
    981 define <8 x float> @test_masked_8xfloat_dup_low_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
    982 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mask0:
    983 ; CHECK:       # %bb.0:
    984 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    985 ; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
    986 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
    987 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
    988 ; CHECK-NEXT:    retq
    989   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    990   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
    991   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
    992   ret <8 x float> %res
    993 }
    994 
    ; Zero-masked variant, register source: lanes where %mask compares oeq to
    ; zero take the <0,0,2,2,4,4,6,6> shuffle of %vec; all other lanes are zero.
    ; Expected code: zero a register, compare into %k1, then a single
    ; {%k1} {z} vmovsldup on ymm0.
    995 define <8 x float> @test_masked_z_8xfloat_dup_low_mask0(<8 x float> %vec, <8 x float> %mask) {
    996 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask0:
    997 ; CHECK:       # %bb.0:
    998 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    999 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
   1000 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
   1001 ; CHECK-NEXT:    retq
   1002   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1003   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1004   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   1005   ret <8 x float> %res
   1006 }
   ; Merge-masked variant, register source: lanes where %mask compares oeq to
   ; zero take the <0,0,2,2,4,4,6,6> shuffle of %vec, the rest keep %vec2. The
   ; assertions expect a {%k1}-masked vmovsldup into ymm1 plus a vmovaps copy
   ; to ymm0. The IR body matches the _mask0 variant; only the name differs.
   1007 define <8 x float> @test_masked_8xfloat_dup_low_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
   1008 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mask1:
   1009 ; CHECK:       # %bb.0:
   1010 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
   1011 ; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
   1012 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
   1013 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
   1014 ; CHECK-NEXT:    retq
   1015   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1016   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1017   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   1018   ret <8 x float> %res
   1019 }
   1020 
   ; Zero-masked variant, register source: lanes where %mask compares oeq to
   ; zero take the <0,0,2,2,4,4,6,6> shuffle of %vec; all other lanes are zero.
   ; The assertions expect a single {%k1} {z} vmovsldup on ymm0. The IR body
   ; matches the _mask0 variant; only the name differs.
   1021 define <8 x float> @test_masked_z_8xfloat_dup_low_mask1(<8 x float> %vec, <8 x float> %mask) {
   1022 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask1:
   1023 ; CHECK:       # %bb.0:
   1024 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1025 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
   1026 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
   1027 ; CHECK-NEXT:    retq
   1028   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1029   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1030   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   1031   ret <8 x float> %res
   1032 }
   ; Merge-masked variant, register source: lanes where %mask compares oeq to
   ; zero take the <0,0,2,2,4,4,6,6> shuffle of %vec, the rest keep %vec2. The
   ; assertions expect a {%k1}-masked vmovsldup into ymm1 plus a vmovaps copy
   ; to ymm0. The IR body matches the _mask0 variant; only the name differs.
   1033 define <8 x float> @test_masked_8xfloat_dup_low_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
   1034 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mask2:
   1035 ; CHECK:       # %bb.0:
   1036 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
   1037 ; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
   1038 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
   1039 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
   1040 ; CHECK-NEXT:    retq
   1041   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1042   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1043   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   1044   ret <8 x float> %res
   1045 }
   1046 
   ; Zero-masked variant, register source: lanes where %mask compares oeq to
   ; zero take the <0,0,2,2,4,4,6,6> shuffle of %vec; all other lanes are zero.
   ; The assertions expect a single {%k1} {z} vmovsldup on ymm0. The IR body
   ; matches the _mask0 variant; only the name differs.
   1047 define <8 x float> @test_masked_z_8xfloat_dup_low_mask2(<8 x float> %vec, <8 x float> %mask) {
   1048 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask2:
   1049 ; CHECK:       # %bb.0:
   1050 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1051 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
   1052 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
   1053 ; CHECK-NEXT:    retq
   1054   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1055   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1056   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   1057   ret <8 x float> %res
   1058 }
   ; Merge-masked variant, register source: lanes where %mask compares oeq to
   ; zero take the <0,0,2,2,4,4,6,6> shuffle of %vec, the rest keep %vec2. The
   ; assertions expect a {%k1}-masked vmovsldup into ymm1 plus a vmovaps copy
   ; to ymm0. The IR body matches the _mask0 variant; only the name differs.
   1059 define <8 x float> @test_masked_8xfloat_dup_low_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
   1060 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mask3:
   1061 ; CHECK:       # %bb.0:
   1062 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
   1063 ; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
   1064 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
   1065 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
   1066 ; CHECK-NEXT:    retq
   1067   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1068   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1069   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   1070   ret <8 x float> %res
   1071 }
   1072 
   ; Zero-masked variant, register source: lanes where %mask compares oeq to
   ; zero take the <0,0,2,2,4,4,6,6> shuffle of %vec; all other lanes are zero.
   ; The assertions expect a single {%k1} {z} vmovsldup on ymm0. The IR body
   ; matches the _mask0 variant; only the name differs.
   1073 define <8 x float> @test_masked_z_8xfloat_dup_low_mask3(<8 x float> %vec, <8 x float> %mask) {
   1074 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask3:
   1075 ; CHECK:       # %bb.0:
   1076 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1077 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
   1078 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
   1079 ; CHECK-NEXT:    retq
   1080   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1081   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1082   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   1083   ret <8 x float> %res
   1084 }
   ; Merge-masked variant, register source: lanes where %mask compares oeq to
   ; zero take the <0,0,2,2,4,4,6,6> shuffle of %vec, the rest keep %vec2. The
   ; assertions expect a {%k1}-masked vmovsldup into ymm1 plus a vmovaps copy
   ; to ymm0. The IR body matches the _mask0 variant; only the name differs.
   1085 define <8 x float> @test_masked_8xfloat_dup_low_mask4(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
   1086 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mask4:
   1087 ; CHECK:       # %bb.0:
   1088 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
   1089 ; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
   1090 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
   1091 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
   1092 ; CHECK-NEXT:    retq
   1093   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1094   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1095   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   1096   ret <8 x float> %res
   1097 }
   1098 
   ; Zero-masked variant, register source: lanes where %mask compares oeq to
   ; zero take the <0,0,2,2,4,4,6,6> shuffle of %vec; all other lanes are zero.
   ; The assertions expect a single {%k1} {z} vmovsldup on ymm0. The IR body
   ; matches the _mask0 variant; only the name differs.
   1099 define <8 x float> @test_masked_z_8xfloat_dup_low_mask4(<8 x float> %vec, <8 x float> %mask) {
   1100 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask4:
   1101 ; CHECK:       # %bb.0:
   1102 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1103 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
   1104 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
   1105 ; CHECK-NEXT:    retq
   1106   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1107   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1108   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   1109   ret <8 x float> %res
   1110 }
   ; Unmasked baseline, memory source: %vec is loaded from %vp and shuffled with
   ; mask <0,0,2,2,4,4,6,6>. The assertions expect the load to appear as the mem
   ; operand of a single vmovsldup writing ymm0.
   1111 define <8 x float> @test_8xfloat_dup_low_mem(<8 x float>* %vp) {
   1112 ; CHECK-LABEL: test_8xfloat_dup_low_mem:
   1113 ; CHECK:       # %bb.0:
   1114 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = mem[0,0,2,2,4,4,6,6]
   1115 ; CHECK-NEXT:    retq
   1116   %vec = load <8 x float>, <8 x float>* %vp
   1117   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1118   ret <8 x float> %res
   1119 }
   1120 define <8 x float> @test_masked_8xfloat_dup_low_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
        ; Merge-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,4,4,6,6), and select the shuffled lane where %mask
        ; compares equal to 0.0, the %vec2 lane otherwise. The expected output
        ; folds the load into a vmovsldup merged under {%k1}.
        ; NOTE(review): the IR is identical to the _mem_mask1.._mem_mask4
        ; variants apart from the name - confirm the duplicates are intended.
   1121 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask0:
   1122 ; CHECK:       # %bb.0:
   1123 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1124 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
   1125 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
   1126 ; CHECK-NEXT:    retq
   1127   %vec = load <8 x float>, <8 x float>* %vp
   1128   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1129   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1130   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   1131   ret <8 x float> %res
   1132 }
   1133 
   1134 define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask0(<8 x float>* %vp, <8 x float> %mask) {
        ; Zero-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,4,4,6,6), and select the shuffled lane where %mask
        ; compares equal to 0.0, 0.0 otherwise. The expected output folds the
        ; load into a vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the _mem_mask1.._mem_mask4
        ; variants apart from the name - confirm the duplicates are intended.
   1135 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask0:
   1136 ; CHECK:       # %bb.0:
   1137 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1138 ; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
   1139 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
   1140 ; CHECK-NEXT:    retq
   1141   %vec = load <8 x float>, <8 x float>* %vp
   1142   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1143   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1144   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   1145   ret <8 x float> %res
   1146 }
   1147 define <8 x float> @test_masked_8xfloat_dup_low_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
        ; Merge-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,4,4,6,6), and select the shuffled lane where %mask
        ; compares equal to 0.0, the %vec2 lane otherwise. The expected output
        ; folds the load into a vmovsldup merged under {%k1}.
        ; NOTE(review): the IR is identical to the other _mem_maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1148 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask1:
   1149 ; CHECK:       # %bb.0:
   1150 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1151 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
   1152 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
   1153 ; CHECK-NEXT:    retq
   1154   %vec = load <8 x float>, <8 x float>* %vp
   1155   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1156   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1157   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   1158   ret <8 x float> %res
   1159 }
   1160 
   1161 define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask1(<8 x float>* %vp, <8 x float> %mask) {
        ; Zero-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,4,4,6,6), and select the shuffled lane where %mask
        ; compares equal to 0.0, 0.0 otherwise. The expected output folds the
        ; load into a vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the other _mem_maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1162 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask1:
   1163 ; CHECK:       # %bb.0:
   1164 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1165 ; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
   1166 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
   1167 ; CHECK-NEXT:    retq
   1168   %vec = load <8 x float>, <8 x float>* %vp
   1169   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1170   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1171   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   1172   ret <8 x float> %res
   1173 }
   1174 define <8 x float> @test_masked_8xfloat_dup_low_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
        ; Merge-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,4,4,6,6), and select the shuffled lane where %mask
        ; compares equal to 0.0, the %vec2 lane otherwise. The expected output
        ; folds the load into a vmovsldup merged under {%k1}.
        ; NOTE(review): the IR is identical to the other _mem_maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1175 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask2:
   1176 ; CHECK:       # %bb.0:
   1177 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1178 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
   1179 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
   1180 ; CHECK-NEXT:    retq
   1181   %vec = load <8 x float>, <8 x float>* %vp
   1182   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1183   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1184   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   1185   ret <8 x float> %res
   1186 }
   1187 
   1188 define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask2(<8 x float>* %vp, <8 x float> %mask) {
        ; Zero-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,4,4,6,6), and select the shuffled lane where %mask
        ; compares equal to 0.0, 0.0 otherwise. The expected output folds the
        ; load into a vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the other _mem_maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1189 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask2:
   1190 ; CHECK:       # %bb.0:
   1191 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1192 ; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
   1193 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
   1194 ; CHECK-NEXT:    retq
   1195   %vec = load <8 x float>, <8 x float>* %vp
   1196   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1197   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1198   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   1199   ret <8 x float> %res
   1200 }
   1201 define <8 x float> @test_masked_8xfloat_dup_low_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
        ; Merge-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,4,4,6,6), and select the shuffled lane where %mask
        ; compares equal to 0.0, the %vec2 lane otherwise. The expected output
        ; folds the load into a vmovsldup merged under {%k1}.
        ; NOTE(review): the IR is identical to the other _mem_maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1202 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask3:
   1203 ; CHECK:       # %bb.0:
   1204 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1205 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
   1206 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
   1207 ; CHECK-NEXT:    retq
   1208   %vec = load <8 x float>, <8 x float>* %vp
   1209   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1210   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1211   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   1212   ret <8 x float> %res
   1213 }
   1214 
   1215 define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask3(<8 x float>* %vp, <8 x float> %mask) {
        ; Zero-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,4,4,6,6), and select the shuffled lane where %mask
        ; compares equal to 0.0, 0.0 otherwise. The expected output folds the
        ; load into a vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the other _mem_maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1216 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask3:
   1217 ; CHECK:       # %bb.0:
   1218 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1219 ; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
   1220 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
   1221 ; CHECK-NEXT:    retq
   1222   %vec = load <8 x float>, <8 x float>* %vp
   1223   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1224   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1225   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   1226   ret <8 x float> %res
   1227 }
   1228 define <8 x float> @test_masked_8xfloat_dup_low_mem_mask4(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
        ; Merge-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,4,4,6,6), and select the shuffled lane where %mask
        ; compares equal to 0.0, the %vec2 lane otherwise. The expected output
        ; folds the load into a vmovsldup merged under {%k1}.
        ; NOTE(review): the IR is identical to the _mem_mask0.._mem_mask3
        ; variants apart from the name - confirm the duplicates are intended.
   1229 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask4:
   1230 ; CHECK:       # %bb.0:
   1231 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1232 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
   1233 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
   1234 ; CHECK-NEXT:    retq
   1235   %vec = load <8 x float>, <8 x float>* %vp
   1236   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1237   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1238   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   1239   ret <8 x float> %res
   1240 }
   1241 
   1242 define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask4(<8 x float>* %vp, <8 x float> %mask) {
        ; Zero-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,4,4,6,6), and select the shuffled lane where %mask
        ; compares equal to 0.0, 0.0 otherwise. The expected output folds the
        ; load into a vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the _mem_mask0.._mem_mask3
        ; variants apart from the name - confirm the duplicates are intended.
   1243 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask4:
   1244 ; CHECK:       # %bb.0:
   1245 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1246 ; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
   1247 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
   1248 ; CHECK-NEXT:    retq
   1249   %vec = load <8 x float>, <8 x float>* %vp
   1250   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1251   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
   1252   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   1253   ret <8 x float> %res
   1254 }
   1255 define <16 x float> @test_16xfloat_dup_low(<16 x float> %vec) {
        ; Unmasked register form on a 16-lane vector: duplicate every
        ; even-indexed lane of %vec into the odd lane that follows it
        ; (0,0,2,2,...,14,14). The expected output is a single vmovsldup on zmm0.
   1256 ; CHECK-LABEL: test_16xfloat_dup_low:
   1257 ; CHECK:       # %bb.0:
   1258 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1259 ; CHECK-NEXT:    retq
   1260   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1261   ret <16 x float> %res
   1262 }
   1263 define <16 x float> @test_masked_16xfloat_dup_low_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
        ; Merge-masked register form: duplicate the even-indexed lanes of %vec
        ; (0,0,2,2,...,14,14), keep a shuffled lane only where %mask compares
        ; equal to 0.0, and pass the %vec2 lane through elsewhere. The expected
        ; output is a vmovsldup merged under {%k1}, then a vmovaps copy to zmm0.
        ; NOTE(review): the IR is identical to the _mask1.._mask4 variants
        ; apart from the name - confirm the duplicates are intended.
   1264 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mask0:
   1265 ; CHECK:       # %bb.0:
   1266 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
   1267 ; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
   1268 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1269 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
   1270 ; CHECK-NEXT:    retq
   1271   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1272   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1273   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   1274   ret <16 x float> %res
   1275 }
   1276 
   1277 define <16 x float> @test_masked_z_16xfloat_dup_low_mask0(<16 x float> %vec, <16 x float> %mask) {
        ; Zero-masked register form: duplicate the even-indexed lanes of %vec
        ; (0,0,2,2,...,14,14), keep a shuffled lane only where %mask compares
        ; equal to 0.0, and produce 0.0 elsewhere. The expected output is a
        ; single vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the _mask1.._mask4 variants
        ; apart from the name - confirm the duplicates are intended.
   1278 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask0:
   1279 ; CHECK:       # %bb.0:
   1280 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1281 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
   1282 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1283 ; CHECK-NEXT:    retq
   1284   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1285   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1286   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   1287   ret <16 x float> %res
   1288 }
   1289 define <16 x float> @test_masked_16xfloat_dup_low_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
        ; Merge-masked register form: duplicate the even-indexed lanes of %vec
        ; (0,0,2,2,...,14,14), keep a shuffled lane only where %mask compares
        ; equal to 0.0, and pass the %vec2 lane through elsewhere. The expected
        ; output is a vmovsldup merged under {%k1}, then a vmovaps copy to zmm0.
        ; NOTE(review): the IR is identical to the other _maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1290 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mask1:
   1291 ; CHECK:       # %bb.0:
   1292 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
   1293 ; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
   1294 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1295 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
   1296 ; CHECK-NEXT:    retq
   1297   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1298   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1299   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   1300   ret <16 x float> %res
   1301 }
   1302 
   1303 define <16 x float> @test_masked_z_16xfloat_dup_low_mask1(<16 x float> %vec, <16 x float> %mask) {
        ; Zero-masked register form: duplicate the even-indexed lanes of %vec
        ; (0,0,2,2,...,14,14), keep a shuffled lane only where %mask compares
        ; equal to 0.0, and produce 0.0 elsewhere. The expected output is a
        ; single vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the other _maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1304 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask1:
   1305 ; CHECK:       # %bb.0:
   1306 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1307 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
   1308 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1309 ; CHECK-NEXT:    retq
   1310   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1311   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1312   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   1313   ret <16 x float> %res
   1314 }
   1315 define <16 x float> @test_masked_16xfloat_dup_low_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
        ; Merge-masked register form: duplicate the even-indexed lanes of %vec
        ; (0,0,2,2,...,14,14), keep a shuffled lane only where %mask compares
        ; equal to 0.0, and pass the %vec2 lane through elsewhere. The expected
        ; output is a vmovsldup merged under {%k1}, then a vmovaps copy to zmm0.
        ; NOTE(review): the IR is identical to the other _maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1316 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mask2:
   1317 ; CHECK:       # %bb.0:
   1318 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
   1319 ; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
   1320 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1321 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
   1322 ; CHECK-NEXT:    retq
   1323   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1324   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1325   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   1326   ret <16 x float> %res
   1327 }
   1328 
   1329 define <16 x float> @test_masked_z_16xfloat_dup_low_mask2(<16 x float> %vec, <16 x float> %mask) {
        ; Zero-masked register form: duplicate the even-indexed lanes of %vec
        ; (0,0,2,2,...,14,14), keep a shuffled lane only where %mask compares
        ; equal to 0.0, and produce 0.0 elsewhere. The expected output is a
        ; single vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the other _maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1330 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask2:
   1331 ; CHECK:       # %bb.0:
   1332 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1333 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
   1334 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1335 ; CHECK-NEXT:    retq
   1336   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1337   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1338   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   1339   ret <16 x float> %res
   1340 }
   1341 define <16 x float> @test_masked_16xfloat_dup_low_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
        ; Merge-masked register form: duplicate the even-indexed lanes of %vec
        ; (0,0,2,2,...,14,14), keep a shuffled lane only where %mask compares
        ; equal to 0.0, and pass the %vec2 lane through elsewhere. The expected
        ; output is a vmovsldup merged under {%k1}, then a vmovaps copy to zmm0.
        ; NOTE(review): the IR is identical to the other _maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1342 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mask3:
   1343 ; CHECK:       # %bb.0:
   1344 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
   1345 ; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
   1346 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1347 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
   1348 ; CHECK-NEXT:    retq
   1349   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1350   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1351   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   1352   ret <16 x float> %res
   1353 }
   1354 
   1355 define <16 x float> @test_masked_z_16xfloat_dup_low_mask3(<16 x float> %vec, <16 x float> %mask) {
        ; Zero-masked register form: duplicate the even-indexed lanes of %vec
        ; (0,0,2,2,...,14,14), keep a shuffled lane only where %mask compares
        ; equal to 0.0, and produce 0.0 elsewhere. The expected output is a
        ; single vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the other _maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1356 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask3:
   1357 ; CHECK:       # %bb.0:
   1358 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1359 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
   1360 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1361 ; CHECK-NEXT:    retq
   1362   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1363   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1364   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   1365   ret <16 x float> %res
   1366 }
   1367 define <16 x float> @test_masked_16xfloat_dup_low_mask4(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
        ; Merge-masked register form: duplicate the even-indexed lanes of %vec
        ; (0,0,2,2,...,14,14), keep a shuffled lane only where %mask compares
        ; equal to 0.0, and pass the %vec2 lane through elsewhere. The expected
        ; output is a vmovsldup merged under {%k1}, then a vmovaps copy to zmm0.
        ; NOTE(review): the IR is identical to the _mask0.._mask3 variants
        ; apart from the name - confirm the duplicates are intended.
   1368 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mask4:
   1369 ; CHECK:       # %bb.0:
   1370 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
   1371 ; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
   1372 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1373 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
   1374 ; CHECK-NEXT:    retq
   1375   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1376   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1377   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   1378   ret <16 x float> %res
   1379 }
   1380 
   1381 define <16 x float> @test_masked_z_16xfloat_dup_low_mask4(<16 x float> %vec, <16 x float> %mask) {
        ; Zero-masked register form: duplicate the even-indexed lanes of %vec
        ; (0,0,2,2,...,14,14), keep a shuffled lane only where %mask compares
        ; equal to 0.0, and produce 0.0 elsewhere. The expected output is a
        ; single vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the _mask0.._mask3 variants
        ; apart from the name - confirm the duplicates are intended.
   1382 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask4:
   1383 ; CHECK:       # %bb.0:
   1384 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1385 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
   1386 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1387 ; CHECK-NEXT:    retq
   1388   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1389   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1390   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   1391   ret <16 x float> %res
   1392 }
   1393 define <16 x float> @test_16xfloat_dup_low_mem(<16 x float>* %vp) {
        ; Unmasked memory form: load the 16-lane vector from %vp and duplicate
        ; its even-indexed lanes (0,0,2,2,...,14,14). The expected output has
        ; the load folded into vmovsldup as a "mem" source operand.
   1394 ; CHECK-LABEL: test_16xfloat_dup_low_mem:
   1395 ; CHECK:       # %bb.0:
   1396 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1397 ; CHECK-NEXT:    retq
   1398   %vec = load <16 x float>, <16 x float>* %vp
   1399   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1400   ret <16 x float> %res
   1401 }
   1402 define <16 x float> @test_masked_16xfloat_dup_low_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
        ; Merge-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,...,14,14), and select the shuffled lane where %mask
        ; compares equal to 0.0, the %vec2 lane otherwise. The expected output
        ; folds the load into a vmovsldup merged under {%k1}.
        ; NOTE(review): the IR is identical to the _mem_mask1.._mem_mask4
        ; variants apart from the name - confirm the duplicates are intended.
   1403 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask0:
   1404 ; CHECK:       # %bb.0:
   1405 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1406 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
   1407 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1408 ; CHECK-NEXT:    retq
   1409   %vec = load <16 x float>, <16 x float>* %vp
   1410   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1411   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1412   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   1413   ret <16 x float> %res
   1414 }
   1415 
   1416 define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask0(<16 x float>* %vp, <16 x float> %mask) {
        ; Zero-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,...,14,14), and select the shuffled lane where %mask
        ; compares equal to 0.0, 0.0 otherwise. The expected output folds the
        ; load into a vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the _mem_mask1.._mem_mask4
        ; variants apart from the name - confirm the duplicates are intended.
   1417 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask0:
   1418 ; CHECK:       # %bb.0:
   1419 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1420 ; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
   1421 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1422 ; CHECK-NEXT:    retq
   1423   %vec = load <16 x float>, <16 x float>* %vp
   1424   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1425   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1426   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   1427   ret <16 x float> %res
   1428 }
   1429 define <16 x float> @test_masked_16xfloat_dup_low_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
        ; Merge-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,...,14,14), and select the shuffled lane where %mask
        ; compares equal to 0.0, the %vec2 lane otherwise. The expected output
        ; folds the load into a vmovsldup merged under {%k1}.
        ; NOTE(review): the IR is identical to the other _mem_maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1430 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask1:
   1431 ; CHECK:       # %bb.0:
   1432 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1433 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
   1434 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1435 ; CHECK-NEXT:    retq
   1436   %vec = load <16 x float>, <16 x float>* %vp
   1437   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1438   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1439   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   1440   ret <16 x float> %res
   1441 }
   1442 
   1443 define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask1(<16 x float>* %vp, <16 x float> %mask) {
        ; Zero-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,...,14,14), and select the shuffled lane where %mask
        ; compares equal to 0.0, 0.0 otherwise. The expected output folds the
        ; load into a vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the other _mem_maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1444 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask1:
   1445 ; CHECK:       # %bb.0:
   1446 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1447 ; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
   1448 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1449 ; CHECK-NEXT:    retq
   1450   %vec = load <16 x float>, <16 x float>* %vp
   1451   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1452   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1453   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   1454   ret <16 x float> %res
   1455 }
   1456 define <16 x float> @test_masked_16xfloat_dup_low_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
        ; Merge-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,...,14,14), and select the shuffled lane where %mask
        ; compares equal to 0.0, the %vec2 lane otherwise. The expected output
        ; folds the load into a vmovsldup merged under {%k1}.
        ; NOTE(review): the IR is identical to the other _mem_maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1457 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask2:
   1458 ; CHECK:       # %bb.0:
   1459 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1460 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
   1461 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1462 ; CHECK-NEXT:    retq
   1463   %vec = load <16 x float>, <16 x float>* %vp
   1464   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1465   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1466   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   1467   ret <16 x float> %res
   1468 }
   1469 
   1470 define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask2(<16 x float>* %vp, <16 x float> %mask) {
        ; Zero-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,...,14,14), and select the shuffled lane where %mask
        ; compares equal to 0.0, 0.0 otherwise. The expected output folds the
        ; load into a vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the other _mem_maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1471 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask2:
   1472 ; CHECK:       # %bb.0:
   1473 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1474 ; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
   1475 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1476 ; CHECK-NEXT:    retq
   1477   %vec = load <16 x float>, <16 x float>* %vp
   1478   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1479   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1480   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   1481   ret <16 x float> %res
   1482 }
   1483 define <16 x float> @test_masked_16xfloat_dup_low_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
        ; Merge-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,...,14,14), and select the shuffled lane where %mask
        ; compares equal to 0.0, the %vec2 lane otherwise. The expected output
        ; folds the load into a vmovsldup merged under {%k1}.
        ; NOTE(review): the IR is identical to the other _mem_maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1484 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask3:
   1485 ; CHECK:       # %bb.0:
   1486 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1487 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
   1488 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1489 ; CHECK-NEXT:    retq
   1490   %vec = load <16 x float>, <16 x float>* %vp
   1491   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1492   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1493   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   1494   ret <16 x float> %res
   1495 }
   1496 
   1497 define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask3(<16 x float>* %vp, <16 x float> %mask) {
        ; Zero-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,...,14,14), and select the shuffled lane where %mask
        ; compares equal to 0.0, 0.0 otherwise. The expected output folds the
        ; load into a vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the other _mem_maskN variants
        ; apart from the name - confirm the duplicates are intended.
   1498 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask3:
   1499 ; CHECK:       # %bb.0:
   1500 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1501 ; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
   1502 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1503 ; CHECK-NEXT:    retq
   1504   %vec = load <16 x float>, <16 x float>* %vp
   1505   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1506   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1507   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   1508   ret <16 x float> %res
   1509 }
   1510 define <16 x float> @test_masked_16xfloat_dup_low_mem_mask4(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
        ; Merge-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,...,14,14), and select the shuffled lane where %mask
        ; compares equal to 0.0, the %vec2 lane otherwise. The expected output
        ; folds the load into a vmovsldup merged under {%k1}.
        ; NOTE(review): the IR is identical to the _mem_mask0.._mem_mask3
        ; variants apart from the name - confirm the duplicates are intended.
   1511 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask4:
   1512 ; CHECK:       # %bb.0:
   1513 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
   1514 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
   1515 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1516 ; CHECK-NEXT:    retq
   1517   %vec = load <16 x float>, <16 x float>* %vp
   1518   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1519   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1520   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   1521   ret <16 x float> %res
   1522 }
   1523 
   1524 define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask4(<16 x float>* %vp, <16 x float> %mask) {
        ; Zero-masked memory form: load from %vp, duplicate the even-indexed
        ; lanes (0,0,2,2,...,14,14), and select the shuffled lane where %mask
        ; compares equal to 0.0, 0.0 otherwise. The expected output folds the
        ; load into a vmovsldup carrying the {%k1} {z} modifiers.
        ; NOTE(review): the IR is identical to the _mem_mask0.._mem_mask3
        ; variants apart from the name - confirm the duplicates are intended.
   1525 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask4:
   1526 ; CHECK:       # %bb.0:
   1527 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1528 ; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
   1529 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
   1530 ; CHECK-NEXT:    retq
   1531   %vec = load <16 x float>, <16 x float>* %vp
   1532   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   1533   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
   1534   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   1535   ret <16 x float> %res
   1536 }
   1537