Home | History | Annotate | Download | only in avx512-shuffles
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
      3 
      4 ; FIXME: The cases that perform 128-bit shuffles of 256-bit vectors should be fixed by PR34359.
      5 
      6 define <8 x float> @test_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) {
        ; Unmasked shuffle: elements 4-7 of %vec1 followed by elements 0-3 of %vec2.
        ; Expected lowering is a single vperm2f128.
      7 ; CHECK-LABEL: test_8xfloat_shuff_mask0:
      8 ; CHECK:       # %bb.0:
      9 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
     10 ; CHECK-NEXT:    retq
     11   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
     12   ret <8 x float> %res
     13 }
     14 define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
        ; Merge-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[4-7] and %vec2[0-3]; all other lanes take %vec3.
     15 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask0:
     16 ; CHECK:       # %bb.0:
     17 ; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
     18 ; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
     19 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
     20 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
     21 ; CHECK-NEXT:    retq
     22   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
     23   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
     24   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
     25   ret <8 x float> %res
     26 }
     27 
     28 define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
        ; Zero-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[4-7] and %vec2[0-3]; all other lanes are zeroed.
     29 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0:
     30 ; CHECK:       # %bb.0:
     31 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
     32 ; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
     33 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
     34 ; CHECK-NEXT:    retq
     35   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
     36   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
     37   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
     38   ret <8 x float> %res
     39 }
     40 define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
        ; Merge-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[4-7] and %vec2[0-3]; all other lanes take %vec3.
     41 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask1:
     42 ; CHECK:       # %bb.0:
     43 ; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
     44 ; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
     45 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
     46 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
     47 ; CHECK-NEXT:    retq
     48   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
     49   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
     50   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
     51   ret <8 x float> %res
     52 }
     53 
     54 define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
        ; Zero-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[4-7] and %vec2[0-3]; all other lanes are zeroed.
     55 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1:
     56 ; CHECK:       # %bb.0:
     57 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
     58 ; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
     59 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
     60 ; CHECK-NEXT:    retq
     61   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
     62   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
     63   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
     64   ret <8 x float> %res
     65 }
     66 define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
        ; Merge-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[4-7] and %vec2[4-7]; all other lanes take %vec3.
     67 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask2:
     68 ; CHECK:       # %bb.0:
     69 ; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
     70 ; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
     71 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
     72 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
     73 ; CHECK-NEXT:    retq
     74   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
     75   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
     76   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
     77   ret <8 x float> %res
     78 }
     79 
     80 define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
        ; Zero-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[4-7] and %vec2[4-7]; all other lanes are zeroed.
     81 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2:
     82 ; CHECK:       # %bb.0:
     83 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
     84 ; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
     85 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
     86 ; CHECK-NEXT:    retq
     87   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
     88   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
     89   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
     90   ret <8 x float> %res
     91 }
     92 define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
        ; Unmasked shuffle: elements 4-7 of %vec1 followed by elements 0-3 of %vec2.
        ; Expected lowering is a single vperm2f128.
     93 ; CHECK-LABEL: test_8xfloat_shuff_mask3:
     94 ; CHECK:       # %bb.0:
     95 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
     96 ; CHECK-NEXT:    retq
     97   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
     98   ret <8 x float> %res
     99 }
    100 define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
        ; Merge-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[4-7] and %vec2[0-3]; all other lanes take %vec3.
    101 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask3:
    102 ; CHECK:       # %bb.0:
    103 ; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
    104 ; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
    105 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
    106 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
    107 ; CHECK-NEXT:    retq
    108   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
    109   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
    110   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
    111   ret <8 x float> %res
    112 }
    113 
    114 define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
        ; Zero-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[4-7] and %vec2[0-3]; all other lanes are zeroed.
    115 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3:
    116 ; CHECK:       # %bb.0:
    117 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    118 ; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
    119 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
    120 ; CHECK-NEXT:    retq
    121   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
    122   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
    123   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
    124   ret <8 x float> %res
    125 }
    126 define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
        ; Unmasked shuffle with %vec2 loaded from %vec2p: elements 4-7 of %vec1 followed by
        ; elements 4-7 of %vec2. Expected lowering is a single vperm2f128 with a memory operand.
    127 ; CHECK-LABEL: test_8xfloat_shuff_mem_mask0:
    128 ; CHECK:       # %bb.0:
    129 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
    130 ; CHECK-NEXT:    retq
    131   %vec2 = load <8 x float>, <8 x float>* %vec2p
    132   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
    133   ret <8 x float> %res
    134 }
    135 define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
        ; Merge-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[4-7] and %vec2[4-7]; all other lanes take %vec3.
    136 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0:
    137 ; CHECK:       # %bb.0:
    138 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    139 ; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
    140 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
    141 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
    142 ; CHECK-NEXT:    retq
    143   %vec2 = load <8 x float>, <8 x float>* %vec2p
    144   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
    145   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
    146   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
    147   ret <8 x float> %res
    148 }
    149 
    150 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
        ; Zero-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[4-7] and %vec2[4-7]; all other lanes are zeroed.
    151 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
    152 ; CHECK:       # %bb.0:
    153 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    154 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
    155 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
    156 ; CHECK-NEXT:    retq
    157   %vec2 = load <8 x float>, <8 x float>* %vec2p
    158   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
    159   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
    160   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
    161   ret <8 x float> %res
    162 }
    163 
    164 define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
        ; Merge-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[4-7] and %vec2[4-7]; all other lanes take %vec3.
    165 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1:
    166 ; CHECK:       # %bb.0:
    167 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    168 ; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
    169 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
    170 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
    171 ; CHECK-NEXT:    retq
    172   %vec2 = load <8 x float>, <8 x float>* %vec2p
    173   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
    174   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
    175   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
    176   ret <8 x float> %res
    177 }
    178 
    179 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
        ; Zero-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[4-7] and %vec2[4-7]; all other lanes are zeroed.
    180 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
    181 ; CHECK:       # %bb.0:
    182 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    183 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
    184 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
    185 ; CHECK-NEXT:    retq
    186   %vec2 = load <8 x float>, <8 x float>* %vec2p
    187   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
    188   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
    189   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
    190   ret <8 x float> %res
    191 }
    192 
    193 define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
        ; Merge-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[4-7] and %vec2[0-3]; all other lanes take %vec3.
    194 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2:
    195 ; CHECK:       # %bb.0:
    196 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    197 ; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
    198 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
    199 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
    200 ; CHECK-NEXT:    retq
    201   %vec2 = load <8 x float>, <8 x float>* %vec2p
    202   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
    203   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
    204   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
    205   ret <8 x float> %res
    206 }
    207 
    208 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
        ; Zero-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[4-7] and %vec2[0-3]; all other lanes are zeroed.
    209 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
    210 ; CHECK:       # %bb.0:
    211 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    212 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
    213 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
    214 ; CHECK-NEXT:    retq
    215   %vec2 = load <8 x float>, <8 x float>* %vec2p
    216   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
    217   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
    218   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
    219   ret <8 x float> %res
    220 }
    221 
    222 define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
        ; Unmasked shuffle with %vec2 loaded from %vec2p: elements 4-7 of %vec1 followed by
        ; elements 0-3 of %vec2. Expected lowering is a single vperm2f128 with a memory operand.
    223 ; CHECK-LABEL: test_8xfloat_shuff_mem_mask3:
    224 ; CHECK:       # %bb.0:
    225 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
    226 ; CHECK-NEXT:    retq
    227   %vec2 = load <8 x float>, <8 x float>* %vec2p
    228   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
    229   ret <8 x float> %res
    230 }
    231 define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
        ; Merge-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[4-7] and %vec2[0-3]; all other lanes take %vec3.
    232 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3:
    233 ; CHECK:       # %bb.0:
    234 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    235 ; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
    236 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
    237 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
    238 ; CHECK-NEXT:    retq
    239   %vec2 = load <8 x float>, <8 x float>* %vec2p
    240   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
    241   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
    242   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
    243   ret <8 x float> %res
    244 }
    245 
    246 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
        ; Zero-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[4-7] and %vec2[0-3]; all other lanes are zeroed.
    247 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
    248 ; CHECK:       # %bb.0:
    249 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    250 ; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
    251 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
    252 ; CHECK-NEXT:    retq
    253   %vec2 = load <8 x float>, <8 x float>* %vec2p
    254   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
    255   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
    256   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
    257   ret <8 x float> %res
    258 }
    259 
    260 define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2) {
        ; Unmasked shuffle: elements 12-15 and 0-3 of %vec1 followed by elements 4-7 and
        ; 12-15 of %vec2. Expected lowering is a single vshuff32x4.
    261 ; CHECK-LABEL: test_16xfloat_shuff_mask0:
    262 ; CHECK:       # %bb.0:
    263 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
    264 ; CHECK-NEXT:    retq
    265   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
    266   ret <16 x float> %res
    267 }
    268 define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
        ; Merge-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[12-15,0-3] and %vec2[4-7,12-15]; all other lanes take %vec3.
    269 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask0:
    270 ; CHECK:       # %bb.0:
    271 ; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
    272 ; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
    273 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
    274 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
    275 ; CHECK-NEXT:    retq
    276   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
    277   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    278   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
    279   ret <16 x float> %res
    280 }
    281 
    282 define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
        ; Zero-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[12-15,0-3] and %vec2[4-7,12-15]; all other lanes are zeroed.
    283 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask0:
    284 ; CHECK:       # %bb.0:
    285 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    286 ; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
    287 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
    288 ; CHECK-NEXT:    retq
    289   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
    290   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    291   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
    292   ret <16 x float> %res
    293 }
    294 define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
        ; Merge-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[0-3,8-11] and %vec2[0-3,12-15]; all other lanes take %vec3.
    295 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask1:
    296 ; CHECK:       # %bb.0:
    297 ; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
    298 ; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
    299 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
    300 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
    301 ; CHECK-NEXT:    retq
    302   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
    303   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    304   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
    305   ret <16 x float> %res
    306 }
    307 
    308 define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
        ; Zero-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[0-3,8-11] and %vec2[0-3,12-15]; all other lanes are zeroed.
    309 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask1:
    310 ; CHECK:       # %bb.0:
    311 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    312 ; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
    313 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
    314 ; CHECK-NEXT:    retq
    315   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
    316   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    317   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
    318   ret <16 x float> %res
    319 }
    320 define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
        ; Merge-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[12-15,4-7] and %vec2[0-3,4-7]; all other lanes take %vec3.
    321 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask2:
    322 ; CHECK:       # %bb.0:
    323 ; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
    324 ; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
    325 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
    326 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
    327 ; CHECK-NEXT:    retq
    328   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
    329   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    330   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
    331   ret <16 x float> %res
    332 }
    333 
    334 define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
        ; Zero-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[12-15,4-7] and %vec2[0-3,4-7]; all other lanes are zeroed.
    335 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask2:
    336 ; CHECK:       # %bb.0:
    337 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    338 ; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
    339 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
    340 ; CHECK-NEXT:    retq
    341   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
    342   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    343   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
    344   ret <16 x float> %res
    345 }
    346 define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
        ; Unmasked shuffle: elements 8-15 of %vec1 followed by elements 0-3 and 8-11 of
        ; %vec2. Expected lowering is a single vshuff32x4.
    347 ; CHECK-LABEL: test_16xfloat_shuff_mask3:
    348 ; CHECK:       # %bb.0:
    349 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
    350 ; CHECK-NEXT:    retq
    351   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
    352   ret <16 x float> %res
    353 }
    354 define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
        ; Merge-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[8-15] and %vec2[0-3,8-11]; all other lanes take %vec3.
    355 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask3:
    356 ; CHECK:       # %bb.0:
    357 ; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
    358 ; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
    359 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
    360 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
    361 ; CHECK-NEXT:    retq
    362   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
    363   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    364   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
    365   ret <16 x float> %res
    366 }
    367 
    368 define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
        ; Zero-masked shuffle: lanes where %mask == 0.0 (fcmp oeq) take the shuffle of
        ; %vec1[8-15] and %vec2[0-3,8-11]; all other lanes are zeroed.
    369 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask3:
    370 ; CHECK:       # %bb.0:
    371 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    372 ; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
    373 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
    374 ; CHECK-NEXT:    retq
    375   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
    376   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    377   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
    378   ret <16 x float> %res
    379 }
    380 define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
        ; Unmasked shuffle with %vec2 loaded from %vec2p: elements 12-15 and 8-11 of %vec1
        ; followed by elements 8-11 and 4-7 of %vec2. Expected lowering is a single
        ; vshuff32x4 with a memory operand.
    381 ; CHECK-LABEL: test_16xfloat_shuff_mem_mask0:
    382 ; CHECK:       # %bb.0:
    383 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
    384 ; CHECK-NEXT:    retq
    385   %vec2 = load <16 x float>, <16 x float>* %vec2p
    386   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
    387   ret <16 x float> %res
    388 }
    389 define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
        ; Merge-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[12-15,8-11] and %vec2[8-11,4-7]; all other
        ; lanes take %vec3.
    390 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask0:
    391 ; CHECK:       # %bb.0:
    392 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    393 ; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
    394 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
    395 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
    396 ; CHECK-NEXT:    retq
    397   %vec2 = load <16 x float>, <16 x float>* %vec2p
    398   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
    399   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    400   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
    401   ret <16 x float> %res
    402 }
    403 
    404 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
        ; Zero-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[12-15,8-11] and %vec2[8-11,4-7]; all other
        ; lanes are zeroed.
    405 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
    406 ; CHECK:       # %bb.0:
    407 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    408 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
    409 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
    410 ; CHECK-NEXT:    retq
    411   %vec2 = load <16 x float>, <16 x float>* %vec2p
    412   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
    413   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    414   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
    415   ret <16 x float> %res
    416 }
    417 
    418 define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
        ; Merge-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[8-11,4-7] and %vec2[8-11,4-7]; all other
        ; lanes take %vec3.
    419 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask1:
    420 ; CHECK:       # %bb.0:
    421 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    422 ; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
    423 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
    424 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
    425 ; CHECK-NEXT:    retq
    426   %vec2 = load <16 x float>, <16 x float>* %vec2p
    427   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
    428   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    429   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
    430   ret <16 x float> %res
    431 }
    432 
    433 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
        ; Zero-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[8-11,4-7] and %vec2[8-11,4-7]; all other
        ; lanes are zeroed.
    434 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
    435 ; CHECK:       # %bb.0:
    436 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    437 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
    438 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
    439 ; CHECK-NEXT:    retq
    440   %vec2 = load <16 x float>, <16 x float>* %vec2p
    441   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
    442   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    443   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
    444   ret <16 x float> %res
    445 }
    446 
    447 define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
        ; Merge-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[0-3,0-3] and %vec2[8-11,8-11]; all other
        ; lanes take %vec3.
    448 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask2:
    449 ; CHECK:       # %bb.0:
    450 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    451 ; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
    452 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
    453 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
    454 ; CHECK-NEXT:    retq
    455   %vec2 = load <16 x float>, <16 x float>* %vec2p
    456   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
    457   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    458   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
    459   ret <16 x float> %res
    460 }
    461 
    462 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
        ; Zero-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[0-3,0-3] and %vec2[8-11,8-11]; all other
        ; lanes are zeroed.
    463 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
    464 ; CHECK:       # %bb.0:
    465 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    466 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
    467 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
    468 ; CHECK-NEXT:    retq
    469   %vec2 = load <16 x float>, <16 x float>* %vec2p
    470   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
    471   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    472   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
    473   ret <16 x float> %res
    474 }
    475 
    476 define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
        ; Unmasked shuffle with %vec2 loaded from %vec2p: elements 4-7 and 0-3 of %vec1
        ; followed by elements 12-15 of %vec2 twice. Expected lowering is a single
        ; vshuff32x4 with a memory operand.
    477 ; CHECK-LABEL: test_16xfloat_shuff_mem_mask3:
    478 ; CHECK:       # %bb.0:
    479 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
    480 ; CHECK-NEXT:    retq
    481   %vec2 = load <16 x float>, <16 x float>* %vec2p
    482   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
    483   ret <16 x float> %res
    484 }
    485 define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
        ; Merge-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[4-7,0-3] and %vec2[12-15,12-15]; all other
        ; lanes take %vec3.
    486 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask3:
    487 ; CHECK:       # %bb.0:
    488 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
    489 ; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
    490 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
    491 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
    492 ; CHECK-NEXT:    retq
    493   %vec2 = load <16 x float>, <16 x float>* %vec2p
    494   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
    495   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    496   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
    497   ret <16 x float> %res
    498 }
    499 
    500 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
        ; Zero-masked shuffle with %vec2 loaded from %vec2p: lanes where %mask == 0.0
        ; (fcmp oeq) take the shuffle of %vec1[4-7,0-3] and %vec2[12-15,12-15]; all other
        ; lanes are zeroed.
    501 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
    502 ; CHECK:       # %bb.0:
    503 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    504 ; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
    505 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
    506 ; CHECK-NEXT:    retq
    507   %vec2 = load <16 x float>, <16 x float>* %vec2p
    508   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
    509   %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
    510   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
    511   ret <16 x float> %res
    512 }
    513 
    514 define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) { ; Unmasked shuffle of two register operands; the checks expect a single vperm2f128.
    515 ; CHECK-LABEL: test_4xdouble_shuff_mask0:
    516 ; CHECK:       # %bb.0:
    517 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
    518 ; CHECK-NEXT:    retq
    519   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; result = %vec1[2,3], %vec2[0,1] (indices >= 4 address %vec2)
    520   ret <4 x double> %res
    521 }
    522 define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; Merge-masked form: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    523 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask0:
    524 ; CHECK:       # %bb.0:
    525 ; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
    526 ; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm3, %k1
    527 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
    528 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
    529 ; CHECK-NEXT:    retq
    530   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; result = %vec1[2,3], %vec2[0,1] (indices >= 4 address %vec2)
    531   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    532   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    533   ret <4 x double> %res
    534 }
    535 
    536 define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; Zero-masked form: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    537 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0:
    538 ; CHECK:       # %bb.0:
    539 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    540 ; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
    541 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
    542 ; CHECK-NEXT:    retq
    543   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; result = %vec1[2,3], %vec2[0,1] (indices >= 4 address %vec2)
    544   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    545   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    546   ret <4 x double> %res
    547 }
    548 define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; Merge-masked form: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    549 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask1:
    550 ; CHECK:       # %bb.0:
    551 ; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
    552 ; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm3, %k1
    553 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
    554 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
    555 ; CHECK-NEXT:    retq
    556   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; result = %vec1[2,3], %vec2[0,1] (indices >= 4 address %vec2)
    557   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    558   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    559   ret <4 x double> %res
    560 }
    561 
    562 define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; Zero-masked form: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    563 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1:
    564 ; CHECK:       # %bb.0:
    565 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    566 ; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
    567 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
    568 ; CHECK-NEXT:    retq
    569   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; result = %vec1[2,3], %vec2[0,1] (indices >= 4 address %vec2)
    570   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    571   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    572   ret <4 x double> %res
    573 }
    574 define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; Merge-masked form: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    575 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask2:
    576 ; CHECK:       # %bb.0:
    577 ; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
    578 ; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm3, %k1
    579 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
    580 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
    581 ; CHECK-NEXT:    retq
    582   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = %vec1[2,3], %vec2[2,3] (indices >= 4 address %vec2)
    583   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    584   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    585   ret <4 x double> %res
    586 }
    587 
    588 define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; Zero-masked form: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    589 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2:
    590 ; CHECK:       # %bb.0:
    591 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    592 ; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
    593 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
    594 ; CHECK-NEXT:    retq
    595   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = %vec1[2,3], %vec2[2,3] (indices >= 4 address %vec2)
    596   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    597   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    598   ret <4 x double> %res
    599 }
    600 define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) { ; Unmasked shuffle of two register operands; the checks expect a single vperm2f128.
    601 ; CHECK-LABEL: test_4xdouble_shuff_mask3:
    602 ; CHECK:       # %bb.0:
    603 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
    604 ; CHECK-NEXT:    retq
    605   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = %vec1[2,3], %vec2[2,3] (indices >= 4 address %vec2)
    606   ret <4 x double> %res
    607 }
    608 define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; Merge-masked form: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    609 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask3:
    610 ; CHECK:       # %bb.0:
    611 ; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
    612 ; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm3, %k1
    613 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
    614 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
    615 ; CHECK-NEXT:    retq
    616   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = %vec1[2,3], %vec2[2,3] (indices >= 4 address %vec2)
    617   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    618   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    619   ret <4 x double> %res
    620 }
    621 
    622 define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; Zero-masked form: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    623 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3:
    624 ; CHECK:       # %bb.0:
    625 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    626 ; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
    627 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
    628 ; CHECK-NEXT:    retq
    629   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = %vec1[2,3], %vec2[2,3] (indices >= 4 address %vec2)
    630   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    631   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    632   ret <4 x double> %res
    633 }
    634 define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) { ; Unmasked shuffle whose second operand is loaded from memory; the checks expect one vperm2f128 with a mem operand.
    635 ; CHECK-LABEL: test_4xdouble_shuff_mem_mask0:
    636 ; CHECK:       # %bb.0:
    637 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
    638 ; CHECK-NEXT:    retq
    639   %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory
    640   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = %vec1[2,3], %vec2[2,3] (indices >= 4 address %vec2)
    641   ret <4 x double> %res
    642 }
    643 define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; Merge-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    644 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0:
    645 ; CHECK:       # %bb.0:
    646 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    647 ; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
    648 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
    649 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
    650 ; CHECK-NEXT:    retq
    651   %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory
    652   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = %vec1[2,3], %vec2[2,3] (indices >= 4 address %vec2)
    653   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    654   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    655   ret <4 x double> %res
    656 }
    657 
    658 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; Zero-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    659 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
    660 ; CHECK:       # %bb.0:
    661 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    662 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    663 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
    664 ; CHECK-NEXT:    retq
    665   %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory
    666   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = %vec1[2,3], %vec2[2,3] (indices >= 4 address %vec2)
    667   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    668   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    669   ret <4 x double> %res
    670 }
    671 
    672 define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; Merge-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    673 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1:
    674 ; CHECK:       # %bb.0:
    675 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    676 ; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
    677 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
    678 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
    679 ; CHECK-NEXT:    retq
    680   %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory
    681   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; result = %vec1[2,3], %vec2[0,1] (indices >= 4 address %vec2)
    682   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    683   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    684   ret <4 x double> %res
    685 }
    686 
    687 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; Zero-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    688 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
    689 ; CHECK:       # %bb.0:
    690 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    691 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    692 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
    693 ; CHECK-NEXT:    retq
    694   %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory
    695   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; result = %vec1[2,3], %vec2[0,1] (indices >= 4 address %vec2)
    696   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    697   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    698   ret <4 x double> %res
    699 }
    700 
    701 define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; Merge-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    702 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2:
    703 ; CHECK:       # %bb.0:
    704 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    705 ; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
    706 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
    707 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
    708 ; CHECK-NEXT:    retq
    709   %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory
    710   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; result = %vec1[2,3], %vec2[0,1] (indices >= 4 address %vec2)
    711   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    712   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    713   ret <4 x double> %res
    714 }
    715 
    716 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; Zero-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    717 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
    718 ; CHECK:       # %bb.0:
    719 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    720 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    721 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
    722 ; CHECK-NEXT:    retq
    723   %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory
    724   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; result = %vec1[2,3], %vec2[0,1] (indices >= 4 address %vec2)
    725   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    726   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    727   ret <4 x double> %res
    728 }
    729 
    730 define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) { ; Unmasked shuffle whose second operand is loaded from memory; the checks expect one vperm2f128 with a mem operand.
    731 ; CHECK-LABEL: test_4xdouble_shuff_mem_mask3:
    732 ; CHECK:       # %bb.0:
    733 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
    734 ; CHECK-NEXT:    retq
    735   %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory
    736   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = %vec1[2,3], %vec2[2,3] (indices >= 4 address %vec2)
    737   ret <4 x double> %res
    738 }
    739 define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; Merge-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    740 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3:
    741 ; CHECK:       # %bb.0:
    742 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    743 ; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
    744 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
    745 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
    746 ; CHECK-NEXT:    retq
    747   %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory
    748   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = %vec1[2,3], %vec2[2,3] (indices >= 4 address %vec2)
    749   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    750   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    751   ret <4 x double> %res
    752 }
    753 
    754 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; Zero-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    755 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
    756 ; CHECK:       # %bb.0:
    757 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    758 ; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
    759 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
    760 ; CHECK-NEXT:    retq
    761   %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory
    762   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = %vec1[2,3], %vec2[2,3] (indices >= 4 address %vec2)
    763   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    764   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    765   ret <4 x double> %res
    766 }
    767 
    768 define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) { ; Unmasked shuffle of two register operands; the checks expect a single vshuff64x2.
    769 ; CHECK-LABEL: test_8xdouble_shuff_mask0:
    770 ; CHECK:       # %bb.0:
    771 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1]
    772 ; CHECK-NEXT:    retq
    773   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9> ; result = %vec1[6,7], %vec1[2,3], %vec2[6,7], %vec2[0,1] (indices >= 8 address %vec2)
    774   ret <8 x double> %res
    775 }
    776 define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) { ; Merge-masked form: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    777 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask0:
    778 ; CHECK:       # %bb.0:
    779 ; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
    780 ; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
    781 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1]
    782 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
    783 ; CHECK-NEXT:    retq
    784   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9> ; result = %vec1[6,7], %vec1[2,3], %vec2[6,7], %vec2[0,1] (indices >= 8 address %vec2)
    785   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    786   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    787   ret <8 x double> %res
    788 }
    789 
    790 define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) { ; Zero-masked form: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    791 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask0:
    792 ; CHECK:       # %bb.0:
    793 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    794 ; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
    795 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1]
    796 ; CHECK-NEXT:    retq
    797   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9> ; result = %vec1[6,7], %vec1[2,3], %vec2[6,7], %vec2[0,1] (indices >= 8 address %vec2)
    798   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    799   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    800   ret <8 x double> %res
    801 }
    802 define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) { ; Merge-masked form: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    803 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask1:
    804 ; CHECK:       # %bb.0:
    805 ; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
    806 ; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
    807 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5]
    808 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
    809 ; CHECK-NEXT:    retq
    810   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13> ; result = %vec1[0,1], %vec1[4,5], %vec2[0,1], %vec2[4,5] (indices >= 8 address %vec2)
    811   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    812   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    813   ret <8 x double> %res
    814 }
    815 
    816 define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) { ; Zero-masked form: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    817 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask1:
    818 ; CHECK:       # %bb.0:
    819 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    820 ; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
    821 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5]
    822 ; CHECK-NEXT:    retq
    823   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13> ; result = %vec1[0,1], %vec1[4,5], %vec2[0,1], %vec2[4,5] (indices >= 8 address %vec2)
    824   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    825   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    826   ret <8 x double> %res
    827 }
    828 define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) { ; Merge-masked form: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    829 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask2:
    830 ; CHECK:       # %bb.0:
    831 ; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
    832 ; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
    833 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1]
    834 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
    835 ; CHECK-NEXT:    retq
    836   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9> ; result = %vec1[6,7], %vec1[4,5], %vec2[4,5], %vec2[0,1] (indices >= 8 address %vec2)
    837   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    838   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    839   ret <8 x double> %res
    840 }
    841 
    842 define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) { ; Zero-masked form: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    843 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask2:
    844 ; CHECK:       # %bb.0:
    845 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    846 ; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
    847 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1]
    848 ; CHECK-NEXT:    retq
    849   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9> ; result = %vec1[6,7], %vec1[4,5], %vec2[4,5], %vec2[0,1] (indices >= 8 address %vec2)
    850   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    851   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    852   ret <8 x double> %res
    853 }
    854 define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) { ; Unmasked shuffle of two register operands; the checks expect a single vshuff64x2.
    855 ; CHECK-LABEL: test_8xdouble_shuff_mask3:
    856 ; CHECK:       # %bb.0:
    857 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3]
    858 ; CHECK-NEXT:    retq
    859   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11> ; result = %vec1[4,5], %vec1[4,5], %vec2[4,5], %vec2[2,3] (indices >= 8 address %vec2)
    860   ret <8 x double> %res
    861 }
    862 define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) { ; Merge-masked form: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    863 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask3:
    864 ; CHECK:       # %bb.0:
    865 ; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
    866 ; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
    867 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3]
    868 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
    869 ; CHECK-NEXT:    retq
    870   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11> ; result = %vec1[4,5], %vec1[4,5], %vec2[4,5], %vec2[2,3] (indices >= 8 address %vec2)
    871   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    872   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    873   ret <8 x double> %res
    874 }
    875 
    876 define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) { ; Zero-masked form: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    877 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask3:
    878 ; CHECK:       # %bb.0:
    879 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    880 ; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
    881 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3]
    882 ; CHECK-NEXT:    retq
    883   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11> ; result = %vec1[4,5], %vec1[4,5], %vec2[4,5], %vec2[2,3] (indices >= 8 address %vec2)
    884   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    885   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    886   ret <8 x double> %res
    887 }
    888 define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) { ; Unmasked shuffle whose second operand is loaded from memory; the checks expect one vshuff64x2 with a mem operand.
    889 ; CHECK-LABEL: test_8xdouble_shuff_mem_mask0:
    890 ; CHECK:       # %bb.0:
    891 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1]
    892 ; CHECK-NEXT:    retq
    893   %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory
    894   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9> ; result = %vec1[6,7], %vec1[0,1], %vec2[0,1], %vec2[0,1] (indices >= 8 address %vec2)
    895   ret <8 x double> %res
    896 }
    897 define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) { ; Merge-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    898 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask0:
    899 ; CHECK:       # %bb.0:
    900 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    901 ; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
    902 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1]
    903 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
    904 ; CHECK-NEXT:    retq
    905   %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory
    906   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9> ; result = %vec1[6,7], %vec1[0,1], %vec2[0,1], %vec2[0,1] (indices >= 8 address %vec2)
    907   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    908   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    909   ret <8 x double> %res
    910 }
    911 
    912 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) { ; Zero-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    913 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
    914 ; CHECK:       # %bb.0:
    915 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    916 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
    917 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1]
    918 ; CHECK-NEXT:    retq
    919   %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory
    920   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9> ; result = %vec1[6,7], %vec1[0,1], %vec2[0,1], %vec2[0,1] (indices >= 8 address %vec2)
    921   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    922   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    923   ret <8 x double> %res
    924 }
    925 
    926 define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) { ; Merge-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    927 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask1:
    928 ; CHECK:       # %bb.0:
    929 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    930 ; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
    931 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3]
    932 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
    933 ; CHECK-NEXT:    retq
    934   %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory
    935   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; result = %vec1[6,7], %vec1[6,7], %vec2[0,1], %vec2[2,3] (indices >= 8 address %vec2)
    936   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    937   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    938   ret <8 x double> %res
    939 }
    940 
    941 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) { ; Zero-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    942 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
    943 ; CHECK:       # %bb.0:
    944 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    945 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
    946 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3]
    947 ; CHECK-NEXT:    retq
    948   %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory
    949   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; result = %vec1[6,7], %vec1[6,7], %vec2[0,1], %vec2[2,3] (indices >= 8 address %vec2)
    950   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    951   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    952   ret <8 x double> %res
    953 }
    954 
    955 define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) { ; Merge-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    956 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask2:
    957 ; CHECK:       # %bb.0:
    958 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    959 ; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
    960 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5]
    961 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
    962 ; CHECK-NEXT:    retq
    963   %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory
    964   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13> ; result = %vec1[0,1], %vec1[2,3], %vec2[0,1], %vec2[4,5] (indices >= 8 address %vec2)
    965   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    966   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
    967   ret <8 x double> %res
    968 }
    969 
    970 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) { ; Zero-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
    971 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
    972 ; CHECK:       # %bb.0:
    973 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
    974 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
    975 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5]
    976 ; CHECK-NEXT:    retq
    977   %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory
    978   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13> ; result = %vec1[0,1], %vec1[2,3], %vec2[0,1], %vec2[4,5] (indices >= 8 address %vec2)
    979   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
    980   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
    981   ret <8 x double> %res
    982 }
    983 
    984 define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) { ; Unmasked shuffle whose second operand is loaded from memory; the checks expect one vshuff64x2 with a mem operand.
    985 ; CHECK-LABEL: test_8xdouble_shuff_mem_mask3:
    986 ; CHECK:       # %bb.0:
    987 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1]
    988 ; CHECK-NEXT:    retq
    989   %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory
    990   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9> ; result = %vec1[2,3], %vec1[0,1], %vec2[4,5], %vec2[0,1] (indices >= 8 address %vec2)
    991   ret <8 x double> %res
    992 }
    993 define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) { ; Merge-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest keep %vec3.
    994 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask3:
    995 ; CHECK:       # %bb.0:
    996 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
    997 ; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
    998 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1]
    999 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
   1000 ; CHECK-NEXT:    retq
   1001   %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory
   1002   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9> ; result = %vec1[2,3], %vec1[0,1], %vec2[4,5], %vec2[0,1] (indices >= 8 address %vec2)
   1003   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
   1004   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
   1005   ret <8 x double> %res
   1006 }
   1007 
   1008 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) { ; Zero-masked form with a memory operand: lanes where %mask compares equal to zero take the shuffle, the rest become zero.
   1009 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
   1010 ; CHECK:       # %bb.0:
   1011 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
   1012 ; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
   1013 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1]
   1014 ; CHECK-NEXT:    retq
   1015   %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory
   1016   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9> ; result = %vec1[2,3], %vec1[0,1], %vec2[4,5], %vec2[0,1] (indices >= 8 address %vec2)
   1017   %cmp = fcmp oeq <8 x double> %mask, zeroinitializer ; per-lane predicate: %mask == 0.0 (ordered compare)
   1018   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; true lanes -> %shuf, false lanes -> 0
   1019   ret <8 x double> %res
   1020 }
   1021 
   1022 define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) { ; Unmasked integer shuffle of two register operands; the checks expect a single vperm2i128.
   1023 ; CHECK-LABEL: test_8xi32_shuff_mask0:
   1024 ; CHECK:       # %bb.0:
   1025 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
   1026 ; CHECK-NEXT:    retq
   1027   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> ; result = %vec1[4-7], %vec2[4-7] (indices >= 8 address %vec2)
   1028   ret <8 x i32> %res
   1029 }
   1030 define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; Merge-masked integer form: lanes where %mask equals zero take the shuffle, the rest keep %vec3.
   1031 ; CHECK-LABEL: test_8xi32_masked_shuff_mask0:
   1032 ; CHECK:       # %bb.0:
   1033 ; CHECK-NEXT:    vptestnmd %ymm3, %ymm3, %k1
   1034 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
   1035 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
   1036 ; CHECK-NEXT:    retq
   1037   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> ; result = %vec1[4-7], %vec2[4-7] (indices >= 8 address %vec2)
   1038   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; per-lane predicate: %mask == 0
   1039   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3 ; true lanes -> %shuf, false lanes -> %vec3
   1040   ret <8 x i32> %res
   1041 }
   1042 
   ; Zero-masked form of the high-half/high-half shuffle: elements whose %mask lane
   ; equals zero take the shuffle value, the others become zero. The checks expect
   ; vptestnmd to build %k1 followed by a zero-masked ({z}) vshufi32x4.
   1043 define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
   1044 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0:
   1045 ; CHECK:       # %bb.0:
   1046 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
   1047 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
   1048 ; CHECK-NEXT:    retq
   1049   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   1050   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1051   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   1052   ret <8 x i32> %res
   1053 }
   ; Merge-masked shuffle: high 128-bit half of %vec1 followed by the low half of
   ; %vec2. Elements whose %mask lane equals zero take the shuffle value, the others
   ; take %vec3. The checks expect vptestnmd, a merge-masked vshufi32x4, and a vmovdqa.
   1054 define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
   1055 ; CHECK-LABEL: test_8xi32_masked_shuff_mask1:
   1056 ; CHECK:       # %bb.0:
   1057 ; CHECK-NEXT:    vptestnmd %ymm3, %ymm3, %k1
   1058 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
   1059 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
   1060 ; CHECK-NEXT:    retq
   1061   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   1062   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1063   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   1064   ret <8 x i32> %res
   1065 }
   1066 
   ; Zero-masked shuffle: high 128-bit half of %vec1 followed by the low half of
   ; %vec2. Elements whose %mask lane equals zero take the shuffle value, the others
   ; become zero. The checks expect vptestnmd plus a zero-masked ({z}) vshufi32x4.
   1067 define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
   1068 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1:
   1069 ; CHECK:       # %bb.0:
   1070 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
   1071 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
   1072 ; CHECK-NEXT:    retq
   1073   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   1074   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1075   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   1076   ret <8 x i32> %res
   1077 }
   ; Merge-masked shuffle: high 128-bit half of %vec1 followed by the high half of
   ; %vec2 (the same shuffle indices as test_8xi32_masked_shuff_mask0). Elements whose
   ; %mask lane equals zero take the shuffle value, the others take %vec3.
   1078 define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
   1079 ; CHECK-LABEL: test_8xi32_masked_shuff_mask2:
   1080 ; CHECK:       # %bb.0:
   1081 ; CHECK-NEXT:    vptestnmd %ymm3, %ymm3, %k1
   1082 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
   1083 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
   1084 ; CHECK-NEXT:    retq
   1085   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   1086   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1087   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   1088   ret <8 x i32> %res
   1089 }
   1090 
   ; Zero-masked shuffle: high 128-bit half of %vec1 followed by the high half of
   ; %vec2 (the same shuffle indices as test_8xi32_zero_masked_shuff_mask0). Elements
   ; whose %mask lane equals zero take the shuffle value, the others become zero.
   1091 define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
   1092 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2:
   1093 ; CHECK:       # %bb.0:
   1094 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
   1095 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
   1096 ; CHECK-NEXT:    retq
   1097   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   1098   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1099   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   1100   ret <8 x i32> %res
   1101 }
   ; Unmasked shuffle: the high 128-bit half of %vec1 followed by the low 128-bit
   ; half of %vec2. The checks expect a single vperm2i128 and no mask register.
   1102 define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
   1103 ; CHECK-LABEL: test_8xi32_shuff_mask3:
   1104 ; CHECK:       # %bb.0:
   1105 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
   1106 ; CHECK-NEXT:    retq
   1107   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   1108   ret <8 x i32> %res
   1109 }
   ; Merge-masked shuffle: high 128-bit half of %vec1 followed by the low half of
   ; %vec2 (the same shuffle indices as test_8xi32_masked_shuff_mask1). Elements whose
   ; %mask lane equals zero take the shuffle value, the others take %vec3.
   1110 define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
   1111 ; CHECK-LABEL: test_8xi32_masked_shuff_mask3:
   1112 ; CHECK:       # %bb.0:
   1113 ; CHECK-NEXT:    vptestnmd %ymm3, %ymm3, %k1
   1114 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
   1115 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
   1116 ; CHECK-NEXT:    retq
   1117   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   1118   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1119   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   1120   ret <8 x i32> %res
   1121 }
   1122 
   ; Zero-masked shuffle: high 128-bit half of %vec1 followed by the low half of
   ; %vec2 (the same shuffle indices as test_8xi32_zero_masked_shuff_mask1). Elements
   ; whose %mask lane equals zero take the shuffle value, the others become zero.
   1123 define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
   1124 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3:
   1125 ; CHECK:       # %bb.0:
   1126 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
   1127 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
   1128 ; CHECK-NEXT:    retq
   1129   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   1130   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1131   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   1132   ret <8 x i32> %res
   1133 }
   ; Unmasked shuffle with a memory operand: the high 128-bit half of %vec1 followed
   ; by the high half of the vector loaded from %vec2p. The checks expect the load to
   ; be folded into a single vperm2i128.
   1134 define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
   1135 ; CHECK-LABEL: test_8xi32_shuff_mem_mask0:
   1136 ; CHECK:       # %bb.0:
   1137 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
   1138 ; CHECK-NEXT:    retq
   1139   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   1140   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   1141   ret <8 x i32> %res
   1142 }
   ; Merge-masked shuffle with a memory operand: high 128-bit half of %vec1 followed
   ; by the high half of the vector loaded from %vec2p. Elements whose %mask lane
   ; equals zero take the shuffle value, the others take %vec3. The checks expect the
   ; load folded into a merge-masked vshufi32x4, then a vmovdqa into ymm0.
   1143 define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
   1144 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0:
   1145 ; CHECK:       # %bb.0:
   1146 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
   1147 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
   1148 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1149 ; CHECK-NEXT:    retq
   1150   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   1151   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   1152   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1153   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   1154   ret <8 x i32> %res
   1155 }
   1156 
   ; Zero-masked shuffle with a memory operand: high 128-bit half of %vec1 followed
   ; by the high half of the vector loaded from %vec2p. Elements whose %mask lane
   ; equals zero take the shuffle value, the others become zero. The checks expect
   ; the load folded into a zero-masked ({z}) vshufi32x4.
   1157 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
   1158 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
   1159 ; CHECK:       # %bb.0:
   1160 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
   1161 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
   1162 ; CHECK-NEXT:    retq
   1163   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   1164   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   1165   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1166   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   1167   ret <8 x i32> %res
   1168 }
   1169 
   ; Merge-masked shuffle with a memory operand: high 128-bit half of %vec1 followed
   ; by the low half of the vector loaded from %vec2p. Elements whose %mask lane
   ; equals zero take the shuffle value, the others take %vec3. The checks expect the
   ; load folded into a merge-masked vshufi32x4, then a vmovdqa into ymm0.
   1170 define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
   1171 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1:
   1172 ; CHECK:       # %bb.0:
   1173 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
   1174 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
   1175 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1176 ; CHECK-NEXT:    retq
   1177   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   1178   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   1179   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1180   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   1181   ret <8 x i32> %res
   1182 }
   1183 
   ; Zero-masked shuffle with a memory operand: high 128-bit half of %vec1 followed
   ; by the low half of the vector loaded from %vec2p. Elements whose %mask lane
   ; equals zero take the shuffle value, the others become zero. The checks expect
   ; the load folded into a zero-masked ({z}) vshufi32x4.
   1184 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
   1185 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
   1186 ; CHECK:       # %bb.0:
   1187 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
   1188 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
   1189 ; CHECK-NEXT:    retq
   1190   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   1191   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   1192   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1193   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   1194   ret <8 x i32> %res
   1195 }
   1196 
   ; Merge-masked shuffle with a memory operand: high 128-bit half of %vec1 followed
   ; by the low half of the vector loaded from %vec2p (the same shuffle indices as
   ; test_8xi32_masked_shuff_mem_mask1). Elements whose %mask lane equals zero take
   ; the shuffle value, the others take %vec3.
   1197 define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
   1198 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2:
   1199 ; CHECK:       # %bb.0:
   1200 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
   1201 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
   1202 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1203 ; CHECK-NEXT:    retq
   1204   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   1205   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   1206   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1207   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   1208   ret <8 x i32> %res
   1209 }
   1210 
   ; Zero-masked shuffle with a memory operand: high 128-bit half of %vec1 followed
   ; by the low half of the vector loaded from %vec2p (the same shuffle indices as
   ; test_8xi32_zero_masked_shuff_mem_mask1). Elements whose %mask lane equals zero
   ; take the shuffle value, the others become zero.
   1211 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
   1212 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
   1213 ; CHECK:       # %bb.0:
   1214 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
   1215 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
   1216 ; CHECK-NEXT:    retq
   1217   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   1218   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   1219   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1220   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   1221   ret <8 x i32> %res
   1222 }
   1223 
   ; Unmasked shuffle with a memory operand: the high 128-bit half of %vec1 followed
   ; by the low half of the vector loaded from %vec2p. The checks expect the load to
   ; be folded into a single vperm2i128.
   1224 define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
   1225 ; CHECK-LABEL: test_8xi32_shuff_mem_mask3:
   1226 ; CHECK:       # %bb.0:
   1227 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
   1228 ; CHECK-NEXT:    retq
   1229   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   1230   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   1231   ret <8 x i32> %res
   1232 }
   ; Merge-masked shuffle with a memory operand: high 128-bit half of %vec1 followed
   ; by the low half of the vector loaded from %vec2p (the same shuffle indices as
   ; test_8xi32_masked_shuff_mem_mask1). Elements whose %mask lane equals zero take
   ; the shuffle value, the others take %vec3.
   1233 define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
   1234 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3:
   1235 ; CHECK:       # %bb.0:
   1236 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
   1237 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
   1238 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1239 ; CHECK-NEXT:    retq
   1240   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   1241   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   1242   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1243   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   1244   ret <8 x i32> %res
   1245 }
   1246 
   ; Zero-masked shuffle with a memory operand: high 128-bit half of %vec1 followed
   ; by the low half of the vector loaded from %vec2p (the same shuffle indices as
   ; test_8xi32_zero_masked_shuff_mem_mask1). Elements whose %mask lane equals zero
   ; take the shuffle value, the others become zero.
   1247 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
   1248 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
   1249 ; CHECK:       # %bb.0:
   1250 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
   1251 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
   1252 ; CHECK-NEXT:    retq
   1253   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   1254   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   1255   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1256   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   1257   ret <8 x i32> %res
   1258 }
   1259 
   ; Unmasked 512-bit shuffle of 128-bit chunks (0-based, four i32 each): chunks 1,1
   ; of %vec1 followed by chunks 1,3 of %vec2. The checks expect one vshufi32x4.
   1260 define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
   1261 ; CHECK-LABEL: test_16xi32_shuff_mask0:
   1262 ; CHECK:       # %bb.0:
   1263 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
   1264 ; CHECK-NEXT:    retq
   1265   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   1266   ret <16 x i32> %res
   1267 }
   ; Merge-masked 512-bit shuffle of 128-bit chunks (0-based): chunks 1,1 of %vec1
   ; followed by chunks 1,3 of %vec2. Elements whose %mask lane equals zero take the
   ; shuffle value, the others take %vec3. The checks expect vptestnmd to build %k1,
   ; a merge-masked vshufi32x4 into zmm2, then a vmovdqa64 into zmm0.
   1268 define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
   1269 ; CHECK-LABEL: test_16xi32_masked_shuff_mask0:
   1270 ; CHECK:       # %bb.0:
   1271 ; CHECK-NEXT:    vptestnmd %zmm3, %zmm3, %k1
   1272 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
   1273 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
   1274 ; CHECK-NEXT:    retq
   1275   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   1276   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1277   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   1278   ret <16 x i32> %res
   1279 }
   1280 
   ; Zero-masked 512-bit shuffle of 128-bit chunks (0-based): chunks 1,1 of %vec1
   ; followed by chunks 1,3 of %vec2. Elements whose %mask lane equals zero take the
   ; shuffle value, the others become zero. The checks expect vptestnmd plus a
   ; zero-masked ({z}) vshufi32x4.
   1281 define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
   1282 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask0:
   1283 ; CHECK:       # %bb.0:
   1284 ; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
   1285 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
   1286 ; CHECK-NEXT:    retq
   1287   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   1288   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1289   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1290   ret <16 x i32> %res
   1291 }
   ; Merge-masked 512-bit shuffle of 128-bit chunks (0-based): chunks 2,2 of %vec1
   ; followed by chunks 2,1 of %vec2. Elements whose %mask lane equals zero take the
   ; shuffle value, the others take %vec3. The checks expect vptestnmd, a merge-masked
   ; vshufi32x4 into zmm2, then a vmovdqa64 into zmm0.
   1292 define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
   1293 ; CHECK-LABEL: test_16xi32_masked_shuff_mask1:
   1294 ; CHECK:       # %bb.0:
   1295 ; CHECK-NEXT:    vptestnmd %zmm3, %zmm3, %k1
   1296 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
   1297 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
   1298 ; CHECK-NEXT:    retq
   1299   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   1300   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1301   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   1302   ret <16 x i32> %res
   1303 }
   1304 
   ; Zero-masked 512-bit shuffle of 128-bit chunks (0-based): chunks 2,2 of %vec1
   ; followed by chunks 2,1 of %vec2. Elements whose %mask lane equals zero take the
   ; shuffle value, the others become zero. The checks expect vptestnmd plus a
   ; zero-masked ({z}) vshufi32x4.
   1305 define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
   1306 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask1:
   1307 ; CHECK:       # %bb.0:
   1308 ; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
   1309 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
   1310 ; CHECK-NEXT:    retq
   1311   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   1312   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1313   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1314   ret <16 x i32> %res
   1315 }
   ; Merge-masked 512-bit shuffle of 128-bit chunks (0-based): chunks 1,2 of %vec1
   ; followed by chunks 0,0 of %vec2. Elements whose %mask lane equals zero take the
   ; shuffle value, the others take %vec3. The checks expect vptestnmd, a merge-masked
   ; vshufi32x4 into zmm2, then a vmovdqa64 into zmm0.
   1316 define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
   1317 ; CHECK-LABEL: test_16xi32_masked_shuff_mask2:
   1318 ; CHECK:       # %bb.0:
   1319 ; CHECK-NEXT:    vptestnmd %zmm3, %zmm3, %k1
   1320 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
   1321 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
   1322 ; CHECK-NEXT:    retq
   1323   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
   1324   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1325   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   1326   ret <16 x i32> %res
   1327 }
   1328 
   ; Zero-masked 512-bit shuffle of 128-bit chunks (0-based): chunks 1,2 of %vec1
   ; followed by chunks 0,0 of %vec2. Elements whose %mask lane equals zero take the
   ; shuffle value, the others become zero. The checks expect vptestnmd plus a
   ; zero-masked ({z}) vshufi32x4.
   1329 define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
   1330 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask2:
   1331 ; CHECK:       # %bb.0:
   1332 ; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
   1333 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
   1334 ; CHECK-NEXT:    retq
   1335   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
   1336   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1337   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1338   ret <16 x i32> %res
   1339 }
   ; Unmasked 512-bit shuffle of 128-bit chunks (0-based, four i32 each): chunks 1,0
   ; of %vec1 followed by chunks 2,1 of %vec2. The checks expect one vshufi32x4.
   1340 define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
   1341 ; CHECK-LABEL: test_16xi32_shuff_mask3:
   1342 ; CHECK:       # %bb.0:
   1343 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
   1344 ; CHECK-NEXT:    retq
   1345   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   1346   ret <16 x i32> %res
   1347 }
   ; Merge-masked 512-bit shuffle of 128-bit chunks (0-based): chunks 1,0 of %vec1
   ; followed by chunks 2,1 of %vec2. Elements whose %mask lane equals zero take the
   ; shuffle value, the others take %vec3. The checks expect vptestnmd, a merge-masked
   ; vshufi32x4 into zmm2, then a vmovdqa64 into zmm0.
   1348 define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
   1349 ; CHECK-LABEL: test_16xi32_masked_shuff_mask3:
   1350 ; CHECK:       # %bb.0:
   1351 ; CHECK-NEXT:    vptestnmd %zmm3, %zmm3, %k1
   1352 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
   1353 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
   1354 ; CHECK-NEXT:    retq
   1355   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   1356   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1357   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   1358   ret <16 x i32> %res
   1359 }
   1360 
   ; Zero-masked 512-bit shuffle of 128-bit chunks (0-based): chunks 1,0 of %vec1
   ; followed by chunks 2,1 of %vec2. Elements whose %mask lane equals zero take the
   ; shuffle value, the others become zero. The checks expect vptestnmd plus a
   ; zero-masked ({z}) vshufi32x4.
   1361 define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
   1362 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask3:
   1363 ; CHECK:       # %bb.0:
   1364 ; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
   1365 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
   1366 ; CHECK-NEXT:    retq
   1367   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   1368   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1369   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1370   ret <16 x i32> %res
   1371 }
   ; Unmasked 512-bit shuffle with a memory operand, in 128-bit chunks (0-based):
   ; chunks 2,1 of %vec1 followed by chunks 2,0 of the vector loaded from %vec2p.
   ; The checks expect the load to be folded into one vshufi32x4.
   1372 define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {
   1373 ; CHECK-LABEL: test_16xi32_shuff_mem_mask0:
   1374 ; CHECK:       # %bb.0:
   1375 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
   1376 ; CHECK-NEXT:    retq
   1377   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   1378   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
   1379   ret <16 x i32> %res
   1380 }
   ; Merge-masked 512-bit shuffle with a memory operand, in 128-bit chunks (0-based):
   ; chunks 2,1 of %vec1 followed by chunks 2,0 of the vector loaded from %vec2p.
   ; Elements whose %mask lane equals zero take the shuffle value, the others take
   ; %vec3. The checks expect the load folded into a merge-masked vshufi32x4 into
   ; zmm1, then a vmovdqa64 into zmm0.
   1381 define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
   1382 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask0:
   1383 ; CHECK:       # %bb.0:
   1384 ; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
   1385 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
   1386 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1387 ; CHECK-NEXT:    retq
   1388   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   1389   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
   1390   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1391   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   1392   ret <16 x i32> %res
   1393 }
   1394 
   ; Zero-masked 512-bit shuffle with a memory operand, in 128-bit chunks (0-based):
   ; chunks 2,1 of %vec1 followed by chunks 2,0 of the vector loaded from %vec2p.
   ; Elements whose %mask lane equals zero take the shuffle value, the others become
   ; zero. The checks expect the load folded into a zero-masked ({z}) vshufi32x4.
   1395 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
   1396 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
   1397 ; CHECK:       # %bb.0:
   1398 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
   1399 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
   1400 ; CHECK-NEXT:    retq
   1401   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   1402   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
   1403   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1404   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1405   ret <16 x i32> %res
   1406 }
   1407 
   ; Merge-masked 512-bit shuffle with a memory operand, in 128-bit chunks (0-based):
   ; chunks 1,1 of %vec1 followed by chunks 0,2 of the vector loaded from %vec2p.
   ; Elements whose %mask lane equals zero take the shuffle value, the others take
   ; %vec3. The checks expect the load folded into a merge-masked vshufi32x4 into
   ; zmm1, then a vmovdqa64 into zmm0.
   1408 define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
   1409 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask1:
   1410 ; CHECK:       # %bb.0:
   1411 ; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
   1412 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
   1413 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1414 ; CHECK-NEXT:    retq
   1415   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   1416   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
   1417   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1418   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   1419   ret <16 x i32> %res
   1420 }
   1421 
   ; Zero-masked 512-bit shuffle with a memory operand, in 128-bit chunks (0-based):
   ; chunks 1,1 of %vec1 followed by chunks 0,2 of the vector loaded from %vec2p.
   ; Elements whose %mask lane equals zero take the shuffle value, the others become
   ; zero. The checks expect the load folded into a zero-masked ({z}) vshufi32x4.
   1422 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
   1423 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
   1424 ; CHECK:       # %bb.0:
   1425 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
   1426 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
   1427 ; CHECK-NEXT:    retq
   1428   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   1429   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
   1430   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1431   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1432   ret <16 x i32> %res
   1433 }
   1434 
   ; Merge-masked 512-bit shuffle with a memory operand, in 128-bit chunks (0-based):
   ; chunks 1,2 of %vec1 followed by chunks 3,3 of the vector loaded from %vec2p.
   ; Elements whose %mask lane equals zero take the shuffle value, the others take
   ; %vec3. The checks expect the load folded into a merge-masked vshufi32x4 into
   ; zmm1, then a vmovdqa64 into zmm0.
   1435 define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
   1436 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask2:
   1437 ; CHECK:       # %bb.0:
   1438 ; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
   1439 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
   1440 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1441 ; CHECK-NEXT:    retq
   1442   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   1443   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   1444   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1445   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   1446   ret <16 x i32> %res
   1447 }
   1448 
   ; Zero-masked 512-bit shuffle with a memory operand, in 128-bit chunks (0-based):
   ; chunks 1,2 of %vec1 followed by chunks 3,3 of the vector loaded from %vec2p.
   ; Elements whose %mask lane equals zero take the shuffle value, the others become
   ; zero. The checks expect the load folded into a zero-masked ({z}) vshufi32x4.
   1449 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
   1450 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
   1451 ; CHECK:       # %bb.0:
   1452 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
   1453 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
   1454 ; CHECK-NEXT:    retq
   1455   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   1456   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   1457   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1458   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1459   ret <16 x i32> %res
   1460 }
   1461 
   ; Unmasked 512-bit shuffle with a memory operand, in 128-bit chunks (0-based):
   ; chunks 1,1 of %vec1 followed by chunks 1,3 of the vector loaded from %vec2p.
   ; The checks expect the load to be folded into one vshufi32x4.
   1462 define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
   1463 ; CHECK-LABEL: test_16xi32_shuff_mem_mask3:
   1464 ; CHECK:       # %bb.0:
   1465 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
   1466 ; CHECK-NEXT:    retq
   1467   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   1468   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   1469   ret <16 x i32> %res
   1470 }
   ; Merge-masked 512-bit shuffle with a memory operand, in 128-bit chunks (0-based):
   ; chunks 1,1 of %vec1 followed by chunks 1,3 of the vector loaded from %vec2p.
   ; Elements whose %mask lane equals zero take the shuffle value, the others take
   ; %vec3. The checks expect the load folded into a merge-masked vshufi32x4 into
   ; zmm1, then a vmovdqa64 into zmm0.
   1471 define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
   1472 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask3:
   1473 ; CHECK:       # %bb.0:
   1474 ; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
   1475 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
   1476 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1477 ; CHECK-NEXT:    retq
   1478   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   1479   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   1480   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1481   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   1482   ret <16 x i32> %res
   1483 }
   1484 
   ; Zero-masked 512-bit shuffle with a memory operand, in 128-bit chunks (0-based):
   ; chunks 1,1 of %vec1 followed by chunks 1,3 of the vector loaded from %vec2p.
   ; Elements whose %mask lane equals zero take the shuffle value, the others become
   ; zero. The checks expect the load folded into a zero-masked ({z}) vshufi32x4.
   1485 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
   1486 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
   1487 ; CHECK:       # %bb.0:
   1488 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
   1489 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
   1490 ; CHECK-NEXT:    retq
   1491   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   1492   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   1493   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1494   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1495   ret <16 x i32> %res
   1496 }
   1497 
   ; Unmasked shuffle: the high 128-bit half of %vec1 (elements 2,3) followed by the
   ; low 128-bit half of %vec2. The checks expect a single vperm2i128.
   1498 define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
   1499 ; CHECK-LABEL: test_4xi64_shuff_mask0:
   1500 ; CHECK:       # %bb.0:
   1501 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
   1502 ; CHECK-NEXT:    retq
   1503   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   1504   ret <4 x i64> %res
   1505 }
   ; Merge-masked shuffle: high 128-bit half of %vec1 followed by the low half of
   ; %vec2. Elements whose %mask lane equals zero take the shuffle value, the others
   ; take %vec3. The checks expect vptestnmq to build %k1, a merge-masked vshufi64x2
   ; into ymm2, then a vmovdqa into ymm0.
   1506 define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
   1507 ; CHECK-LABEL: test_4xi64_masked_shuff_mask0:
   1508 ; CHECK:       # %bb.0:
   1509 ; CHECK-NEXT:    vptestnmq %ymm3, %ymm3, %k1
   1510 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
   1511 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
   1512 ; CHECK-NEXT:    retq
   1513   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   1514   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1515   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   1516   ret <4 x i64> %res
   1517 }
   1518 
   ; Zero-masked shuffle: high 128-bit half of %vec1 followed by the low half of
   ; %vec2. Elements whose %mask lane equals zero take the shuffle value, the others
   ; become zero. The checks expect vptestnmq plus a zero-masked ({z}) vshufi64x2.
   1519 define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
   1520 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0:
   1521 ; CHECK:       # %bb.0:
   1522 ; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
   1523 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
   1524 ; CHECK-NEXT:    retq
   1525   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   1526   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1527   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   1528   ret <4 x i64> %res
   1529 }
   ; Merge-masked shuffle: high 128-bit half of %vec1 followed by the high half of
   ; %vec2. Elements whose %mask lane equals zero take the shuffle value, the others
   ; take %vec3. The checks expect vptestnmq, a merge-masked vshufi64x2 into ymm2,
   ; then a vmovdqa into ymm0.
   1530 define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
   1531 ; CHECK-LABEL: test_4xi64_masked_shuff_mask1:
   1532 ; CHECK:       # %bb.0:
   1533 ; CHECK-NEXT:    vptestnmq %ymm3, %ymm3, %k1
   1534 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
   1535 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
   1536 ; CHECK-NEXT:    retq
   1537   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1538   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1539   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   1540   ret <4 x i64> %res
   1541 }
   1542 
   ; Zero-masked shuffle: high 128-bit half of %vec1 followed by the high half of
   ; %vec2. Elements whose %mask lane equals zero take the shuffle value, the others
   ; become zero. The checks expect vptestnmq plus a zero-masked ({z}) vshufi64x2.
   1543 define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
   1544 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1:
   1545 ; CHECK:       # %bb.0:
   1546 ; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
   1547 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
   1548 ; CHECK-NEXT:    retq
   1549   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1550   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1551   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   1552   ret <4 x i64> %res
   1553 }
   ; Merge-masked shuffle: high 128-bit half of %vec1 followed by the low half of
   ; %vec2 (the same shuffle indices as test_4xi64_masked_shuff_mask0). Elements whose
   ; %mask lane equals zero take the shuffle value, the others take %vec3.
   1554 define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
   1555 ; CHECK-LABEL: test_4xi64_masked_shuff_mask2:
   1556 ; CHECK:       # %bb.0:
   1557 ; CHECK-NEXT:    vptestnmq %ymm3, %ymm3, %k1
   1558 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
   1559 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
   1560 ; CHECK-NEXT:    retq
   1561   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   1562   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1563   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   1564   ret <4 x i64> %res
   1565 }
   1566 
   1567 define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
        ; Zero-masked 4 x i64 shuffle of two register operands.
        ; The shuffle takes elements 2,3 of %vec1 followed by elements 0,1 of %vec2
        ; (shuffle indices 4,5). Lanes whose %mask element equals zero receive the
        ; shuffle result; every other lane is set to zero.
   1568 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2:
   1569 ; CHECK:       # %bb.0:
   1570 ; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
   1571 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
   1572 ; CHECK-NEXT:    retq
   1573   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   1574   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1575   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   1576   ret <4 x i64> %res
   1577 }
   1578 define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
        ; Unmasked 4 x i64 shuffle of two register operands: elements 2,3 of %vec1
        ; followed by elements 2,3 of %vec2 (shuffle indices 6,7).
        ; With no mask involved the expected output is a single vperm2i128.
   1579 ; CHECK-LABEL: test_4xi64_shuff_mask3:
   1580 ; CHECK:       # %bb.0:
   1581 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
   1582 ; CHECK-NEXT:    retq
   1583   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1584   ret <4 x i64> %res
   1585 }
   1586 define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
        ; Merge-masked counterpart of test_4xi64_shuff_mask3.
        ; The shuffle takes elements 2,3 of %vec1 followed by elements 2,3 of %vec2
        ; (shuffle indices 6,7). Lanes whose %mask element equals zero receive the
        ; shuffle result; every other lane keeps the matching element of %vec3.
   1587 ; CHECK-LABEL: test_4xi64_masked_shuff_mask3:
   1588 ; CHECK:       # %bb.0:
   1589 ; CHECK-NEXT:    vptestnmq %ymm3, %ymm3, %k1
   1590 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
   1591 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
   1592 ; CHECK-NEXT:    retq
   1593   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1594   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1595   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   1596   ret <4 x i64> %res
   1597 }
   1598 
   1599 define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
        ; Zero-masked counterpart of test_4xi64_shuff_mask3.
        ; The shuffle takes elements 2,3 of %vec1 followed by elements 2,3 of %vec2
        ; (shuffle indices 6,7). Lanes whose %mask element equals zero receive the
        ; shuffle result; every other lane is set to zero.
   1600 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3:
   1601 ; CHECK:       # %bb.0:
   1602 ; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
   1603 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
   1604 ; CHECK-NEXT:    retq
   1605   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1606   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1607   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   1608   ret <4 x i64> %res
   1609 }
   1610 define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
        ; Unmasked 4 x i64 shuffle whose second source is loaded from %vec2p:
        ; elements 2,3 of %vec1 followed by elements 2,3 of the loaded vector
        ; (shuffle indices 6,7). The expected output is a single vperm2i128 that
        ; reads the second source straight from memory.
   1611 ; CHECK-LABEL: test_4xi64_shuff_mem_mask0:
   1612 ; CHECK:       # %bb.0:
   1613 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
   1614 ; CHECK-NEXT:    retq
   1615   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   1616   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1617   ret <4 x i64> %res
   1618 }
   1619 define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
        ; Merge-masked 4 x i64 shuffle whose second source is loaded from %vec2p.
        ; The shuffle takes elements 2,3 of %vec1 followed by elements 2,3 of the
        ; loaded vector (shuffle indices 6,7). Lanes whose %mask element equals zero
        ; receive the shuffle result; every other lane keeps the element of %vec3.
   1620 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0:
   1621 ; CHECK:       # %bb.0:
   1622 ; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
   1623 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
   1624 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1625 ; CHECK-NEXT:    retq
   1626   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   1627   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1628   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1629   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   1630   ret <4 x i64> %res
   1631 }
   1632 
   1633 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
        ; Zero-masked 4 x i64 shuffle whose second source is loaded from %vec2p.
        ; The shuffle takes elements 2,3 of %vec1 followed by elements 2,3 of the
        ; loaded vector (shuffle indices 6,7). Lanes whose %mask element equals zero
        ; receive the shuffle result; every other lane is set to zero.
   1634 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
   1635 ; CHECK:       # %bb.0:
   1636 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
   1637 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
   1638 ; CHECK-NEXT:    retq
   1639   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   1640   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1641   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1642   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   1643   ret <4 x i64> %res
   1644 }
   1645 
   1646 define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
        ; Merge-masked 4 x i64 shuffle whose second source is loaded from %vec2p.
        ; The shuffle takes elements 2,3 of %vec1 followed by elements 0,1 of the
        ; loaded vector (shuffle indices 4,5). Lanes whose %mask element equals zero
        ; receive the shuffle result; every other lane keeps the element of %vec3.
   1647 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1:
   1648 ; CHECK:       # %bb.0:
   1649 ; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
   1650 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
   1651 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1652 ; CHECK-NEXT:    retq
   1653   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   1654   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   1655   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1656   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   1657   ret <4 x i64> %res
   1658 }
   1659 
   1660 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
        ; Zero-masked 4 x i64 shuffle whose second source is loaded from %vec2p.
        ; The shuffle takes elements 2,3 of %vec1 followed by elements 0,1 of the
        ; loaded vector (shuffle indices 4,5). Lanes whose %mask element equals zero
        ; receive the shuffle result; every other lane is set to zero.
   1661 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
   1662 ; CHECK:       # %bb.0:
   1663 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
   1664 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
   1665 ; CHECK-NEXT:    retq
   1666   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   1667   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   1668   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1669   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   1670   ret <4 x i64> %res
   1671 }
   1672 
   1673 define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
        ; Merge-masked 4 x i64 shuffle whose second source is loaded from %vec2p.
        ; The shuffle takes elements 2,3 of %vec1 followed by elements 0,1 of the
        ; loaded vector (shuffle indices 4,5). Lanes whose %mask element equals zero
        ; receive the shuffle result; every other lane keeps the element of %vec3.
   1674 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2:
   1675 ; CHECK:       # %bb.0:
   1676 ; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
   1677 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
   1678 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1679 ; CHECK-NEXT:    retq
   1680   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   1681   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   1682   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1683   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   1684   ret <4 x i64> %res
   1685 }
   1686 
   1687 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
        ; Zero-masked 4 x i64 shuffle whose second source is loaded from %vec2p.
        ; The shuffle takes elements 2,3 of %vec1 followed by elements 0,1 of the
        ; loaded vector (shuffle indices 4,5). Lanes whose %mask element equals zero
        ; receive the shuffle result; every other lane is set to zero.
   1688 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
   1689 ; CHECK:       # %bb.0:
   1690 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
   1691 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
   1692 ; CHECK-NEXT:    retq
   1693   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   1694   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   1695   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1696   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   1697   ret <4 x i64> %res
   1698 }
   1699 
   1700 define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
        ; Unmasked 4 x i64 shuffle whose second source is loaded from %vec2p:
        ; elements 2,3 of %vec1 followed by elements 2,3 of the loaded vector
        ; (shuffle indices 6,7). The expected output is a single vperm2i128 that
        ; reads the second source straight from memory.
   1701 ; CHECK-LABEL: test_4xi64_shuff_mem_mask3:
   1702 ; CHECK:       # %bb.0:
   1703 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
   1704 ; CHECK-NEXT:    retq
   1705   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   1706   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1707   ret <4 x i64> %res
   1708 }
   1709 define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
        ; Merge-masked counterpart of test_4xi64_shuff_mem_mask3.
        ; The shuffle takes elements 2,3 of %vec1 followed by elements 2,3 of the
        ; vector loaded from %vec2p (shuffle indices 6,7). Lanes whose %mask element
        ; equals zero receive the shuffle result; other lanes keep %vec3's element.
   1710 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3:
   1711 ; CHECK:       # %bb.0:
   1712 ; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
   1713 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
   1714 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1715 ; CHECK-NEXT:    retq
   1716   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   1717   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1718   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1719   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   1720   ret <4 x i64> %res
   1721 }
   1722 
   1723 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
        ; Zero-masked counterpart of test_4xi64_shuff_mem_mask3.
        ; The shuffle takes elements 2,3 of %vec1 followed by elements 2,3 of the
        ; vector loaded from %vec2p (shuffle indices 6,7). Lanes whose %mask element
        ; equals zero receive the shuffle result; every other lane is set to zero.
   1724 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
   1725 ; CHECK:       # %bb.0:
   1726 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
   1727 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
   1728 ; CHECK-NEXT:    retq
   1729   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   1730   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1731   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   1732   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   1733   ret <4 x i64> %res
   1734 }
   1735 
   1736 define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
        ; Unmasked 8 x i64 shuffle of two register operands: elements 4,5,4,5 of
        ; %vec1 followed by elements 4,5,4,5 of %vec2 (shuffle indices 12,13,12,13).
        ; The expected output is a single unmasked vshufi64x2 on zmm registers.
   1737 ; CHECK-LABEL: test_8xi64_shuff_mask0:
   1738 ; CHECK:       # %bb.0:
   1739 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5]
   1740 ; CHECK-NEXT:    retq
   1741   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
   1742   ret <8 x i64> %res
   1743 }
   1744 define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
        ; Merge-masked counterpart of test_8xi64_shuff_mask0.
        ; The shuffle takes elements 4,5,4,5 of %vec1 followed by elements 4,5,4,5
        ; of %vec2 (shuffle indices 12,13,12,13). Lanes whose %mask element equals
        ; zero receive the shuffle result; other lanes keep %vec3's element.
   1745 ; CHECK-LABEL: test_8xi64_masked_shuff_mask0:
   1746 ; CHECK:       # %bb.0:
   1747 ; CHECK-NEXT:    vptestnmq %zmm3, %zmm3, %k1
   1748 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
   1749 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
   1750 ; CHECK-NEXT:    retq
   1751   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
   1752   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1753   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   1754   ret <8 x i64> %res
   1755 }
   1756 
   1757 define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Zero-masked counterpart of test_8xi64_shuff_mask0.
        ; The shuffle takes elements 4,5,4,5 of %vec1 followed by elements 4,5,4,5
        ; of %vec2 (shuffle indices 12,13,12,13). Lanes whose %mask element equals
        ; zero receive the shuffle result; every other lane is set to zero.
   1758 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask0:
   1759 ; CHECK:       # %bb.0:
   1760 ; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
   1761 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5]
   1762 ; CHECK-NEXT:    retq
   1763   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
   1764   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1765   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   1766   ret <8 x i64> %res
   1767 }
   1768 define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
        ; Merge-masked 8 x i64 shuffle of two register operands.
        ; The shuffle takes elements 6,7,4,5 of %vec1 followed by elements 2,3,4,5
        ; of %vec2 (shuffle indices 10,11,12,13). Lanes whose %mask element equals
        ; zero receive the shuffle result; other lanes keep %vec3's element.
   1769 ; CHECK-LABEL: test_8xi64_masked_shuff_mask1:
   1770 ; CHECK:       # %bb.0:
   1771 ; CHECK-NEXT:    vptestnmq %zmm3, %zmm3, %k1
   1772 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5]
   1773 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
   1774 ; CHECK-NEXT:    retq
   1775   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
   1776   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1777   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   1778   ret <8 x i64> %res
   1779 }
   1780 
   1781 define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Zero-masked 8 x i64 shuffle of two register operands.
        ; The shuffle takes elements 6,7,4,5 of %vec1 followed by elements 2,3,4,5
        ; of %vec2 (shuffle indices 10,11,12,13). Lanes whose %mask element equals
        ; zero receive the shuffle result; every other lane is set to zero.
   1782 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask1:
   1783 ; CHECK:       # %bb.0:
   1784 ; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
   1785 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5]
   1786 ; CHECK-NEXT:    retq
   1787   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
   1788   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1789   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   1790   ret <8 x i64> %res
   1791 }
   1792 define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
        ; Merge-masked 8 x i64 shuffle of two register operands.
        ; The shuffle takes elements 0,1,4,5 of %vec1 followed by elements 0,1,0,1
        ; of %vec2 (shuffle indices 8,9,8,9). Lanes whose %mask element equals
        ; zero receive the shuffle result; other lanes keep %vec3's element.
   1793 ; CHECK-LABEL: test_8xi64_masked_shuff_mask2:
   1794 ; CHECK:       # %bb.0:
   1795 ; CHECK-NEXT:    vptestnmq %zmm3, %zmm3, %k1
   1796 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1]
   1797 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
   1798 ; CHECK-NEXT:    retq
   1799   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
   1800   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1801   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   1802   ret <8 x i64> %res
   1803 }
   1804 
   1805 define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Zero-masked 8 x i64 shuffle of two register operands.
        ; The shuffle takes elements 0,1,4,5 of %vec1 followed by elements 0,1,0,1
        ; of %vec2 (shuffle indices 8,9,8,9). Lanes whose %mask element equals
        ; zero receive the shuffle result; every other lane is set to zero.
   1806 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask2:
   1807 ; CHECK:       # %bb.0:
   1808 ; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
   1809 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1]
   1810 ; CHECK-NEXT:    retq
   1811   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
   1812   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1813   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   1814   ret <8 x i64> %res
   1815 }
   1816 define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
        ; Unmasked 8 x i64 shuffle of two register operands: elements 2,3,6,7 of
        ; %vec1 followed by elements 4,5,2,3 of %vec2 (shuffle indices 12,13,10,11).
        ; The expected output is a single unmasked vshufi64x2 on zmm registers.
   1817 ; CHECK-LABEL: test_8xi64_shuff_mask3:
   1818 ; CHECK:       # %bb.0:
   1819 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3]
   1820 ; CHECK-NEXT:    retq
   1821   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
   1822   ret <8 x i64> %res
   1823 }
   1824 define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
        ; Merge-masked counterpart of test_8xi64_shuff_mask3.
        ; The shuffle takes elements 2,3,6,7 of %vec1 followed by elements 4,5,2,3
        ; of %vec2 (shuffle indices 12,13,10,11). Lanes whose %mask element equals
        ; zero receive the shuffle result; other lanes keep %vec3's element.
   1825 ; CHECK-LABEL: test_8xi64_masked_shuff_mask3:
   1826 ; CHECK:       # %bb.0:
   1827 ; CHECK-NEXT:    vptestnmq %zmm3, %zmm3, %k1
   1828 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3]
   1829 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
   1830 ; CHECK-NEXT:    retq
   1831   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
   1832   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1833   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   1834   ret <8 x i64> %res
   1835 }
   1836 
   1837 define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Zero-masked counterpart of test_8xi64_shuff_mask3.
        ; The shuffle takes elements 2,3,6,7 of %vec1 followed by elements 4,5,2,3
        ; of %vec2 (shuffle indices 12,13,10,11). Lanes whose %mask element equals
        ; zero receive the shuffle result; every other lane is set to zero.
   1838 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask3:
   1839 ; CHECK:       # %bb.0:
   1840 ; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
   1841 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3]
   1842 ; CHECK-NEXT:    retq
   1843   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
   1844   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1845   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   1846   ret <8 x i64> %res
   1847 }
   1848 define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
        ; Unmasked 8 x i64 shuffle whose second source is loaded from %vec2p:
        ; elements 2,3,2,3 of %vec1 followed by elements 4,5,2,3 of the loaded
        ; vector (shuffle indices 12,13,10,11). The expected output is a single
        ; vshufi64x2 that reads the second source straight from memory.
   1849 ; CHECK-LABEL: test_8xi64_shuff_mem_mask0:
   1850 ; CHECK:       # %bb.0:
   1851 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3]
   1852 ; CHECK-NEXT:    retq
   1853   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   1854   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
   1855   ret <8 x i64> %res
   1856 }
   1857 define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
        ; Merge-masked counterpart of test_8xi64_shuff_mem_mask0.
        ; The shuffle takes elements 2,3,2,3 of %vec1 followed by elements 4,5,2,3
        ; of the vector loaded from %vec2p (shuffle indices 12,13,10,11). Lanes whose
        ; %mask element equals zero receive the shuffle result; others keep %vec3.
   1858 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask0:
   1859 ; CHECK:       # %bb.0:
   1860 ; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
   1861 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3]
   1862 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1863 ; CHECK-NEXT:    retq
   1864   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   1865   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
   1866   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1867   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   1868   ret <8 x i64> %res
   1869 }
   1870 
   1871 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
        ; Zero-masked counterpart of test_8xi64_shuff_mem_mask0.
        ; The shuffle takes elements 2,3,2,3 of %vec1 followed by elements 4,5,2,3
        ; of the vector loaded from %vec2p (shuffle indices 12,13,10,11). Lanes whose
        ; %mask element equals zero receive the shuffle result; others are zeroed.
   1872 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
   1873 ; CHECK:       # %bb.0:
   1874 ; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
   1875 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3]
   1876 ; CHECK-NEXT:    retq
   1877   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   1878   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
   1879   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1880   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   1881   ret <8 x i64> %res
   1882 }
   1883 
   1884 define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
        ; Merge-masked 8 x i64 shuffle whose second source is loaded from %vec2p.
        ; The shuffle takes elements 2,3,0,1 of %vec1 followed by elements 0,1,0,1
        ; of the loaded vector (shuffle indices 8,9,8,9). Lanes whose %mask element
        ; equals zero receive the shuffle result; other lanes keep %vec3's element.
   1885 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask1:
   1886 ; CHECK:       # %bb.0:
   1887 ; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
   1888 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1]
   1889 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1890 ; CHECK-NEXT:    retq
   1891   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   1892   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   1893   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1894   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   1895   ret <8 x i64> %res
   1896 }
   1897 
   1898 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
        ; Zero-masked 8 x i64 shuffle whose second source is loaded from %vec2p.
        ; The shuffle takes elements 2,3,0,1 of %vec1 followed by elements 0,1,0,1
        ; of the loaded vector (shuffle indices 8,9,8,9). Lanes whose %mask element
        ; equals zero receive the shuffle result; every other lane is set to zero.
   1899 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
   1900 ; CHECK:       # %bb.0:
   1901 ; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
   1902 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1]
   1903 ; CHECK-NEXT:    retq
   1904   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   1905   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   1906   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1907   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   1908   ret <8 x i64> %res
   1909 }
   1910 
   1911 define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
        ; Merge-masked 8 x i64 shuffle whose second source is loaded from %vec2p.
        ; The shuffle takes elements 4,5,0,1 of %vec1 followed by elements 2,3,2,3
        ; of the loaded vector (shuffle indices 10,11,10,11). Lanes whose %mask
        ; element equals zero receive the shuffle result; others keep %vec3.
   1912 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask2:
   1913 ; CHECK:       # %bb.0:
   1914 ; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
   1915 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3]
   1916 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1917 ; CHECK-NEXT:    retq
   1918   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   1919   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
   1920   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1921   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   1922   ret <8 x i64> %res
   1923 }
   1924 
   1925 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
        ; Zero-masked 8 x i64 shuffle whose second source is loaded from %vec2p.
        ; The shuffle takes elements 4,5,0,1 of %vec1 followed by elements 2,3,2,3
        ; of the loaded vector (shuffle indices 10,11,10,11). Lanes whose %mask
        ; element equals zero receive the shuffle result; others are zeroed.
   1926 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
   1927 ; CHECK:       # %bb.0:
   1928 ; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
   1929 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3]
   1930 ; CHECK-NEXT:    retq
   1931   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   1932   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
   1933   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1934   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   1935   ret <8 x i64> %res
   1936 }
   1937 
   1938 define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
        ; Unmasked 8 x i64 shuffle whose second source is loaded from %vec2p:
        ; elements 2,3,0,1 of %vec1 followed by elements 6,7,2,3 of the loaded
        ; vector (shuffle indices 14,15,10,11). The expected output is a single
        ; vshufi64x2 that reads the second source straight from memory.
   1939 ; CHECK-LABEL: test_8xi64_shuff_mem_mask3:
   1940 ; CHECK:       # %bb.0:
   1941 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3]
   1942 ; CHECK-NEXT:    retq
   1943   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   1944   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
   1945   ret <8 x i64> %res
   1946 }
   1947 define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
        ; Merge-masked counterpart of test_8xi64_shuff_mem_mask3.
        ; The shuffle takes elements 2,3,0,1 of %vec1 followed by elements 6,7,2,3
        ; of the vector loaded from %vec2p (shuffle indices 14,15,10,11). Lanes whose
        ; %mask element equals zero receive the shuffle result; others keep %vec3.
   1948 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask3:
   1949 ; CHECK:       # %bb.0:
   1950 ; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
   1951 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3]
   1952 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1953 ; CHECK-NEXT:    retq
   1954   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   1955   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
   1956   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1957   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   1958   ret <8 x i64> %res
   1959 }
   1960 
   1961 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
        ; Zero-masked counterpart of test_8xi64_shuff_mem_mask3.
        ; The shuffle takes elements 2,3,0,1 of %vec1 followed by elements 6,7,2,3
        ; of the vector loaded from %vec2p (shuffle indices 14,15,10,11). Lanes whose
        ; %mask element equals zero receive the shuffle result; others are zeroed.
   1962 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
   1963 ; CHECK:       # %bb.0:
   1964 ; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
   1965 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3]
   1966 ; CHECK-NEXT:    retq
   1967   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   1968   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
   1969   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   1970   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   1971   ret <8 x i64> %res
   1972 }
   1973 
   1974