Home | History | Annotate | Download | only in avx512-shuffles
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s
      3 
      4 define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) {
      5 ; CHECK-LABEL: test_16xi8_perm_mask0:
      6 ; CHECK:       # %bb.0:
      7 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
      8 ; CHECK-NEXT:    retq
        ; Unmasked case: shuffle the bytes of %vec (second operand undef) with a
        ; constant index vector; the assertions above expect a single vpshufb.
      9   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
     10   ret <16 x i8> %res
     11 }
     12 define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
     13 ; CHECK-LABEL: test_masked_16xi8_perm_mask0:
     14 ; CHECK:       # %bb.0:
     15 ; CHECK-NEXT:    vptestnmb %xmm2, %xmm2, %k1
     16 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
     17 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
     18 ; CHECK-NEXT:    retq
        ; Merge-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest keep %vec2. The assertions above expect vptestnmb to form %k1, a
        ; masked vpshufb into xmm1, and a copy of xmm1 to xmm0.
     19   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
     20   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
     21   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
     22   ret <16 x i8> %res
     23 }
     24 
     25 define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) {
     26 ; CHECK-LABEL: test_masked_z_16xi8_perm_mask0:
     27 ; CHECK:       # %bb.0:
     28 ; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
     29 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
     30 ; CHECK-NEXT:    retq
        ; Zero-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest become zero. The assertions above expect vptestnmb to form %k1 and
        ; a vpshufb using {%k1} {z}.
     31   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
     32   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
     33   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
     34   ret <16 x i8> %res
     35 }
     36 define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
     37 ; CHECK-LABEL: test_masked_16xi8_perm_mask1:
     38 ; CHECK:       # %bb.0:
     39 ; CHECK-NEXT:    vptestnmb %xmm2, %xmm2, %k1
     40 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
     41 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
     42 ; CHECK-NEXT:    retq
        ; Merge-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest keep %vec2. The assertions above expect vptestnmb to form %k1, a
        ; masked vpshufb into xmm1, and a copy of xmm1 to xmm0.
     43   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
     44   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
     45   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
     46   ret <16 x i8> %res
     47 }
     48 
     49 define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) {
     50 ; CHECK-LABEL: test_masked_z_16xi8_perm_mask1:
     51 ; CHECK:       # %bb.0:
     52 ; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
     53 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
     54 ; CHECK-NEXT:    retq
        ; Zero-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest become zero. The assertions above expect vptestnmb to form %k1 and
        ; a vpshufb using {%k1} {z}.
     55   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
     56   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
     57   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
     58   ret <16 x i8> %res
     59 }
     60 define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
     61 ; CHECK-LABEL: test_masked_16xi8_perm_mask2:
     62 ; CHECK:       # %bb.0:
     63 ; CHECK-NEXT:    vptestnmb %xmm2, %xmm2, %k1
     64 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
     65 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
     66 ; CHECK-NEXT:    retq
        ; Merge-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest keep %vec2. The assertions above expect vptestnmb to form %k1, a
        ; masked vpshufb into xmm1, and a copy of xmm1 to xmm0.
     67   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
     68   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
     69   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
     70   ret <16 x i8> %res
     71 }
     72 
     73 define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) {
     74 ; CHECK-LABEL: test_masked_z_16xi8_perm_mask2:
     75 ; CHECK:       # %bb.0:
     76 ; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
     77 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
     78 ; CHECK-NEXT:    retq
        ; Zero-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest become zero. The assertions above expect vptestnmb to form %k1 and
        ; a vpshufb using {%k1} {z}.
     79   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
     80   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
     81   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
     82   ret <16 x i8> %res
     83 }
     84 define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) {
     85 ; CHECK-LABEL: test_16xi8_perm_mask3:
     86 ; CHECK:       # %bb.0:
     87 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
     88 ; CHECK-NEXT:    retq
        ; Unmasked case: shuffle the bytes of %vec (second operand undef) with a
        ; constant index vector; the assertions above expect a single vpshufb.
     89   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
     90   ret <16 x i8> %res
     91 }
     92 define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
     93 ; CHECK-LABEL: test_masked_16xi8_perm_mask3:
     94 ; CHECK:       # %bb.0:
     95 ; CHECK-NEXT:    vptestnmb %xmm2, %xmm2, %k1
     96 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
     97 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
     98 ; CHECK-NEXT:    retq
        ; Merge-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest keep %vec2. The assertions above expect vptestnmb to form %k1, a
        ; masked vpshufb into xmm1, and a copy of xmm1 to xmm0.
     99   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
    100   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
    101   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
    102   ret <16 x i8> %res
    103 }
    104 
    105 define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) {
    106 ; CHECK-LABEL: test_masked_z_16xi8_perm_mask3:
    107 ; CHECK:       # %bb.0:
    108 ; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
    109 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
    110 ; CHECK-NEXT:    retq
        ; Zero-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest become zero. The assertions above expect vptestnmb to form %k1 and
        ; a vpshufb using {%k1} {z}.
    111   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
    112   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
    113   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
    114   ret <16 x i8> %res
    115 }
    116 define <16 x i8> @test_16xi8_perm_mem_mask0(<16 x i8>* %vp) {
    117 ; CHECK-LABEL: test_16xi8_perm_mem_mask0:
    118 ; CHECK:       # %bb.0:
    119 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
    120 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
    121 ; CHECK-NEXT:    retq
        ; Unmasked case with the source vector loaded from %vp; the assertions
        ; above expect a separate vmovdqa load followed by vpshufb.
    122   %vec = load <16 x i8>, <16 x i8>* %vp
    123   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
    124   ret <16 x i8> %res
    125 }
    126 define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
    127 ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask0:
    128 ; CHECK:       # %bb.0:
    129 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
    130 ; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
    131 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
    132 ; CHECK-NEXT:    retq
        ; Merge-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest keep %vec2. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then a masked vpshufb into xmm0.
    133   %vec = load <16 x i8>, <16 x i8>* %vp
    134   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
    135   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
    136   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
    137   ret <16 x i8> %res
    138 }
    139 
    140 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %mask) {
    141 ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask0:
    142 ; CHECK:       # %bb.0:
    143 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
    144 ; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
    145 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
    146 ; CHECK-NEXT:    retq
        ; Zero-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest become zero. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then vpshufb using {%k1} {z}.
    147   %vec = load <16 x i8>, <16 x i8>* %vp
    148   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
    149   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
    150   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
    151   ret <16 x i8> %res
    152 }
    153 
    154 define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
    155 ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask1:
    156 ; CHECK:       # %bb.0:
    157 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
    158 ; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
    159 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
    160 ; CHECK-NEXT:    retq
        ; Merge-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest keep %vec2. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then a masked vpshufb into xmm0.
    161   %vec = load <16 x i8>, <16 x i8>* %vp
    162   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
    163   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
    164   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
    165   ret <16 x i8> %res
    166 }
    167 
    168 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %mask) {
    169 ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask1:
    170 ; CHECK:       # %bb.0:
    171 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
    172 ; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
    173 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
    174 ; CHECK-NEXT:    retq
        ; Zero-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest become zero. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then vpshufb using {%k1} {z}.
    175   %vec = load <16 x i8>, <16 x i8>* %vp
    176   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
    177   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
    178   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
    179   ret <16 x i8> %res
    180 }
    181 
    182 define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
    183 ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask2:
    184 ; CHECK:       # %bb.0:
    185 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
    186 ; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
    187 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
    188 ; CHECK-NEXT:    retq
        ; Merge-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest keep %vec2. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then a masked vpshufb into xmm0.
    189   %vec = load <16 x i8>, <16 x i8>* %vp
    190   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
    191   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
    192   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
    193   ret <16 x i8> %res
    194 }
    195 
    196 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %mask) {
    197 ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask2:
    198 ; CHECK:       # %bb.0:
    199 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
    200 ; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
    201 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
    202 ; CHECK-NEXT:    retq
        ; Zero-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest become zero. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then vpshufb using {%k1} {z}.
    203   %vec = load <16 x i8>, <16 x i8>* %vp
    204   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
    205   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
    206   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
    207   ret <16 x i8> %res
    208 }
    209 
    210 define <16 x i8> @test_16xi8_perm_mem_mask3(<16 x i8>* %vp) {
    211 ; CHECK-LABEL: test_16xi8_perm_mem_mask3:
    212 ; CHECK:       # %bb.0:
    213 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
    214 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
    215 ; CHECK-NEXT:    retq
        ; Unmasked case with the source vector loaded from %vp; the assertions
        ; above expect a separate vmovdqa load followed by vpshufb.
    216   %vec = load <16 x i8>, <16 x i8>* %vp
    217   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
    218   ret <16 x i8> %res
    219 }
    220 define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
    221 ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask3:
    222 ; CHECK:       # %bb.0:
    223 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
    224 ; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
    225 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
    226 ; CHECK-NEXT:    retq
        ; Merge-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest keep %vec2. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then a masked vpshufb into xmm0.
    227   %vec = load <16 x i8>, <16 x i8>* %vp
    228   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
    229   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
    230   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
    231   ret <16 x i8> %res
    232 }
    233 
    234 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %mask) {
    235 ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask3:
    236 ; CHECK:       # %bb.0:
    237 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
    238 ; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
    239 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
    240 ; CHECK-NEXT:    retq
        ; Zero-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest become zero. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then vpshufb using {%k1} {z}.
    241   %vec = load <16 x i8>, <16 x i8>* %vp
    242   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
    243   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
    244   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
    245   ret <16 x i8> %res
    246 }
    247 
    248 define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
    249 ; CHECK-LABEL: test_32xi8_perm_mask0:
    250 ; CHECK:       # %bb.0:
    251 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
    252 ; CHECK-NEXT:    retq
        ; Unmasked 32-byte case: indices 0-15 fill the first 16 result lanes and
        ; indices 16-31 fill the last 16; the assertions above expect one vpshufb.
    253   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
    254   ret <32 x i8> %res
    255 }
    256 define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
    257 ; CHECK-LABEL: test_masked_32xi8_perm_mask0:
    258 ; CHECK:       # %bb.0:
    259 ; CHECK-NEXT:    vptestnmb %ymm2, %ymm2, %k1
    260 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
    261 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
    262 ; CHECK-NEXT:    retq
        ; Merge-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest keep %vec2. The assertions above expect vptestnmb to form %k1, a
        ; masked vpshufb into ymm1, and a copy of ymm1 to ymm0.
    263   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
    264   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    265   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
    266   ret <32 x i8> %res
    267 }
    268 
    269 define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) {
    270 ; CHECK-LABEL: test_masked_z_32xi8_perm_mask0:
    271 ; CHECK:       # %bb.0:
    272 ; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
    273 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
    274 ; CHECK-NEXT:    retq
        ; Zero-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest become zero. The assertions above expect vptestnmb to form %k1 and
        ; a vpshufb using {%k1} {z}.
    275   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
    276   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    277   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
    278   ret <32 x i8> %res
    279 }
    280 define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
    281 ; CHECK-LABEL: test_masked_32xi8_perm_mask1:
    282 ; CHECK:       # %bb.0:
    283 ; CHECK-NEXT:    vptestnmb %ymm2, %ymm2, %k1
    284 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
    285 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
    286 ; CHECK-NEXT:    retq
        ; Merge-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest keep %vec2. The assertions above expect vptestnmb to form %k1, a
        ; masked vpshufb into ymm1, and a copy of ymm1 to ymm0.
    287   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
    288   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    289   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
    290   ret <32 x i8> %res
    291 }
    292 
    293 define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) {
    294 ; CHECK-LABEL: test_masked_z_32xi8_perm_mask1:
    295 ; CHECK:       # %bb.0:
    296 ; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
    297 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
    298 ; CHECK-NEXT:    retq
        ; Zero-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest become zero. The assertions above expect vptestnmb to form %k1 and
        ; a vpshufb using {%k1} {z}.
    299   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
    300   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    301   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
    302   ret <32 x i8> %res
    303 }
    304 define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
    305 ; CHECK-LABEL: test_masked_32xi8_perm_mask2:
    306 ; CHECK:       # %bb.0:
    307 ; CHECK-NEXT:    vptestnmb %ymm2, %ymm2, %k1
    308 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
    309 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
    310 ; CHECK-NEXT:    retq
        ; Merge-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest keep %vec2. The assertions above expect vptestnmb to form %k1, a
        ; masked vpshufb into ymm1, and a copy of ymm1 to ymm0.
    311   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
    312   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    313   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
    314   ret <32 x i8> %res
    315 }
    316 
    317 define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) {
    318 ; CHECK-LABEL: test_masked_z_32xi8_perm_mask2:
    319 ; CHECK:       # %bb.0:
    320 ; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
    321 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
    322 ; CHECK-NEXT:    retq
        ; Zero-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest become zero. The assertions above expect vptestnmb to form %k1 and
        ; a vpshufb using {%k1} {z}.
    323   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
    324   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    325   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
    326   ret <32 x i8> %res
    327 }
    328 define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
    329 ; CHECK-LABEL: test_32xi8_perm_mask3:
    330 ; CHECK:       # %bb.0:
    331 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
    332 ; CHECK-NEXT:    retq
        ; Unmasked 32-byte case: indices 0-15 fill the first 16 result lanes and
        ; indices 16-31 fill the last 16; the assertions above expect one vpshufb.
    333   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
    334   ret <32 x i8> %res
    335 }
    336 define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
    337 ; CHECK-LABEL: test_masked_32xi8_perm_mask3:
    338 ; CHECK:       # %bb.0:
    339 ; CHECK-NEXT:    vptestnmb %ymm2, %ymm2, %k1
    340 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
    341 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
    342 ; CHECK-NEXT:    retq
        ; Merge-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest keep %vec2. The assertions above expect vptestnmb to form %k1, a
        ; masked vpshufb into ymm1, and a copy of ymm1 to ymm0.
    343   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
    344   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    345   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
    346   ret <32 x i8> %res
    347 }
    348 
    349 define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) {
    350 ; CHECK-LABEL: test_masked_z_32xi8_perm_mask3:
    351 ; CHECK:       # %bb.0:
    352 ; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
    353 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
    354 ; CHECK-NEXT:    retq
        ; Zero-masked case: lanes where %mask is zero take the shuffled byte, the
        ; rest become zero. The assertions above expect vptestnmb to form %k1 and
        ; a vpshufb using {%k1} {z}.
    355   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
    356   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    357   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
    358   ret <32 x i8> %res
    359 }
    360 define <32 x i8> @test_32xi8_perm_mem_mask0(<32 x i8>* %vp) {
    361 ; CHECK-LABEL: test_32xi8_perm_mem_mask0:
    362 ; CHECK:       # %bb.0:
    363 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
    364 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
    365 ; CHECK-NEXT:    retq
        ; Unmasked case with the source vector loaded from %vp; the assertions
        ; above expect a separate vmovdqa load followed by vpshufb.
    366   %vec = load <32 x i8>, <32 x i8>* %vp
    367   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
    368   ret <32 x i8> %res
    369 }
    370 define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
    371 ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask0:
    372 ; CHECK:       # %bb.0:
    373 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
    374 ; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
    375 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
    376 ; CHECK-NEXT:    retq
        ; Merge-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest keep %vec2. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then a masked vpshufb into ymm0.
    377   %vec = load <32 x i8>, <32 x i8>* %vp
    378   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
    379   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    380   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
    381   ret <32 x i8> %res
    382 }
    383 
    384 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %mask) {
    385 ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask0:
    386 ; CHECK:       # %bb.0:
    387 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
    388 ; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
    389 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
    390 ; CHECK-NEXT:    retq
        ; Zero-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest become zero. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then vpshufb using {%k1} {z}.
    391   %vec = load <32 x i8>, <32 x i8>* %vp
    392   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
    393   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    394   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
    395   ret <32 x i8> %res
    396 }
    397 
    398 define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
    399 ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask1:
    400 ; CHECK:       # %bb.0:
    401 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
    402 ; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
    403 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
    404 ; CHECK-NEXT:    retq
        ; Merge-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest keep %vec2. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then a masked vpshufb into ymm0.
    405   %vec = load <32 x i8>, <32 x i8>* %vp
    406   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
    407   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    408   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
    409   ret <32 x i8> %res
    410 }
    411 
    412 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %mask) {
    413 ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask1:
    414 ; CHECK:       # %bb.0:
    415 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
    416 ; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
    417 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
    418 ; CHECK-NEXT:    retq
        ; Zero-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest become zero. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then vpshufb using {%k1} {z}.
    419   %vec = load <32 x i8>, <32 x i8>* %vp
    420   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
    421   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    422   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
    423   ret <32 x i8> %res
    424 }
    425 
    426 define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
    427 ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask2:
    428 ; CHECK:       # %bb.0:
    429 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
    430 ; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
    431 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
    432 ; CHECK-NEXT:    retq
        ; Merge-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest keep %vec2. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then a masked vpshufb into ymm0.
    433   %vec = load <32 x i8>, <32 x i8>* %vp
    434   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
    435   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    436   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
    437   ret <32 x i8> %res
    438 }
    439 
    440 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %mask) {
    441 ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask2:
    442 ; CHECK:       # %bb.0:
    443 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
    444 ; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
    445 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
    446 ; CHECK-NEXT:    retq
        ; Zero-masked case with the source vector loaded from %vp: lanes where
        ; %mask is zero take the shuffled byte, the rest become zero. The assertions
        ; above expect vmovdqa, vptestnmb into %k1, then vpshufb using {%k1} {z}.
    447   %vec = load <32 x i8>, <32 x i8>* %vp
    448   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
    449   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
    450   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
    451   ret <32 x i8> %res
    452 }
    453 
    454 define <32 x i8> @test_32xi8_perm_mem_mask3(<32 x i8>* %vp) {
    455 ; CHECK-LABEL: test_32xi8_perm_mem_mask3:
    456 ; CHECK:       # %bb.0:
    457 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
    458 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
    459 ; CHECK-NEXT:    retq
        ; Unmasked case with the source vector loaded from %vp; the assertions
        ; above expect a separate vmovdqa load followed by vpshufb.
    460   %vec = load <32 x i8>, <32 x i8>* %vp
    461   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
    462   ret <32 x i8> %res
    463 }
    464 define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) { ; Merge-masked byte shuffle of a <32 x i8> loaded from %vp; unselected lanes keep %vec2.
    465 ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask3:
    466 ; CHECK:       # %bb.0:
    467 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
    468 ; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
    469 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
    470 ; CHECK-NEXT:    retq
    471   %vec = load <32 x i8>, <32 x i8>* %vp ; shuffle source comes from memory
    472   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
    473   %cmp = icmp eq <32 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    474   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2 ; shuffled byte where %cmp is true, otherwise the %vec2 byte
    475   ret <32 x i8> %res
    476 }
    477 
    478 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %mask) { ; Zero-masked byte shuffle of a <32 x i8> loaded from %vp; unselected lanes become zero.
    479 ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask3:
    480 ; CHECK:       # %bb.0:
    481 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
    482 ; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
    483 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
    484 ; CHECK-NEXT:    retq
    485   %vec = load <32 x i8>, <32 x i8>* %vp ; shuffle source comes from memory
    486   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
    487   %cmp = icmp eq <32 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    488   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer ; shuffled byte where %cmp is true, otherwise zero
    489   ret <32 x i8> %res
    490 }
    491 
    492 define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) { ; Unmasked byte shuffle of a <64 x i8> register argument; each index stays inside its own 16-byte group.
    493 ; CHECK-LABEL: test_64xi8_perm_mask0:
    494 ; CHECK:       # %bb.0:
    495 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
    496 ; CHECK-NEXT:    retq
    497   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
    498   ret <64 x i8> %res
    499 }
    500 define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { ; Merge-masked byte shuffle of %vec; unselected lanes keep %vec2.
    501 ; CHECK-LABEL: test_masked_64xi8_perm_mask0:
    502 ; CHECK:       # %bb.0:
    503 ; CHECK-NEXT:    vptestnmb %zmm2, %zmm2, %k1
    504 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
    505 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
    506 ; CHECK-NEXT:    retq
    507   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
    508   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    509   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, otherwise the %vec2 byte
    510   ret <64 x i8> %res
    511 }
    512 
    513 define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) { ; Zero-masked byte shuffle of %vec; unselected lanes become zero.
    514 ; CHECK-LABEL: test_masked_z_64xi8_perm_mask0:
    515 ; CHECK:       # %bb.0:
    516 ; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
    517 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
    518 ; CHECK-NEXT:    retq
    519   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
    520   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    521   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ; shuffled byte where %cmp is true, otherwise zero
    522   ret <64 x i8> %res
    523 }
    524 define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { ; Merge-masked byte shuffle of %vec; unselected lanes keep %vec2.
    525 ; CHECK-LABEL: test_masked_64xi8_perm_mask1:
    526 ; CHECK:       # %bb.0:
    527 ; CHECK-NEXT:    vptestnmb %zmm2, %zmm2, %k1
    528 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
    529 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
    530 ; CHECK-NEXT:    retq
    531   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
    532   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    533   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, otherwise the %vec2 byte
    534   ret <64 x i8> %res
    535 }
    536 
    537 define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) { ; Zero-masked byte shuffle of %vec; unselected lanes become zero.
    538 ; CHECK-LABEL: test_masked_z_64xi8_perm_mask1:
    539 ; CHECK:       # %bb.0:
    540 ; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
    541 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
    542 ; CHECK-NEXT:    retq
    543   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
    544   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    545   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ; shuffled byte where %cmp is true, otherwise zero
    546   ret <64 x i8> %res
    547 }
    548 define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { ; Merge-masked byte shuffle of %vec; unselected lanes keep %vec2.
    549 ; CHECK-LABEL: test_masked_64xi8_perm_mask2:
    550 ; CHECK:       # %bb.0:
    551 ; CHECK-NEXT:    vptestnmb %zmm2, %zmm2, %k1
    552 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
    553 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
    554 ; CHECK-NEXT:    retq
    555   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
    556   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    557   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, otherwise the %vec2 byte
    558   ret <64 x i8> %res
    559 }
    560 
    561 define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) { ; Zero-masked byte shuffle of %vec; unselected lanes become zero.
    562 ; CHECK-LABEL: test_masked_z_64xi8_perm_mask2:
    563 ; CHECK:       # %bb.0:
    564 ; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
    565 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
    566 ; CHECK-NEXT:    retq
    567   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
    568   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    569   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ; shuffled byte where %cmp is true, otherwise zero
    570   ret <64 x i8> %res
    571 }
    572 define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) { ; Unmasked byte shuffle of a <64 x i8> register argument; each index stays inside its own 16-byte group.
    573 ; CHECK-LABEL: test_64xi8_perm_mask3:
    574 ; CHECK:       # %bb.0:
    575 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
    576 ; CHECK-NEXT:    retq
    577   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
    578   ret <64 x i8> %res
    579 }
    580 define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { ; Merge-masked byte shuffle of %vec; unselected lanes keep %vec2.
    581 ; CHECK-LABEL: test_masked_64xi8_perm_mask3:
    582 ; CHECK:       # %bb.0:
    583 ; CHECK-NEXT:    vptestnmb %zmm2, %zmm2, %k1
    584 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
    585 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
    586 ; CHECK-NEXT:    retq
    587   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
    588   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    589   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, otherwise the %vec2 byte
    590   ret <64 x i8> %res
    591 }
    592 
    593 define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) { ; Zero-masked byte shuffle of %vec; unselected lanes become zero.
    594 ; CHECK-LABEL: test_masked_z_64xi8_perm_mask3:
    595 ; CHECK:       # %bb.0:
    596 ; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
    597 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
    598 ; CHECK-NEXT:    retq
    599   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
    600   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    601   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ; shuffled byte where %cmp is true, otherwise zero
    602   ret <64 x i8> %res
    603 }
    604 define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) { ; Unmasked byte shuffle of a <64 x i8> loaded from %vp; each index stays inside its own 16-byte group.
    605 ; CHECK-LABEL: test_64xi8_perm_mem_mask0:
    606 ; CHECK:       # %bb.0:
    607 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
    608 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
    609 ; CHECK-NEXT:    retq
    610   %vec = load <64 x i8>, <64 x i8>* %vp ; shuffle source comes from memory
    611   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
    612   ret <64 x i8> %res
    613 }
    614 define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) { ; Merge-masked byte shuffle of a <64 x i8> loaded from %vp; unselected lanes keep %vec2.
    615 ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask0:
    616 ; CHECK:       # %bb.0:
    617 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
    618 ; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
    619 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
    620 ; CHECK-NEXT:    retq
    621   %vec = load <64 x i8>, <64 x i8>* %vp ; shuffle source comes from memory
    622   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
    623   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    624   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, otherwise the %vec2 byte
    625   ret <64 x i8> %res
    626 }
    627 
    628 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %mask) { ; Zero-masked byte shuffle of a <64 x i8> loaded from %vp; unselected lanes become zero.
    629 ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask0:
    630 ; CHECK:       # %bb.0:
    631 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
    632 ; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
    633 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
    634 ; CHECK-NEXT:    retq
    635   %vec = load <64 x i8>, <64 x i8>* %vp ; shuffle source comes from memory
    636   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
    637   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    638   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ; shuffled byte where %cmp is true, otherwise zero
    639   ret <64 x i8> %res
    640 }
    641 
    642 define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) { ; Merge-masked byte shuffle of a <64 x i8> loaded from %vp; unselected lanes keep %vec2.
    643 ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask1:
    644 ; CHECK:       # %bb.0:
    645 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
    646 ; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
    647 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
    648 ; CHECK-NEXT:    retq
    649   %vec = load <64 x i8>, <64 x i8>* %vp ; shuffle source comes from memory
    650   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
    651   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    652   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, otherwise the %vec2 byte
    653   ret <64 x i8> %res
    654 }
    655 
    656 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %mask) { ; Zero-masked byte shuffle of a <64 x i8> loaded from %vp; unselected lanes become zero.
    657 ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask1:
    658 ; CHECK:       # %bb.0:
    659 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
    660 ; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
    661 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
    662 ; CHECK-NEXT:    retq
    663   %vec = load <64 x i8>, <64 x i8>* %vp ; shuffle source comes from memory
    664   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
    665   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    666   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ; shuffled byte where %cmp is true, otherwise zero
    667   ret <64 x i8> %res
    668 }
    669 
    670 define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) { ; Merge-masked byte shuffle of a <64 x i8> loaded from %vp; unselected lanes keep %vec2.
    671 ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask2:
    672 ; CHECK:       # %bb.0:
    673 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
    674 ; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
    675 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
    676 ; CHECK-NEXT:    retq
    677   %vec = load <64 x i8>, <64 x i8>* %vp ; shuffle source comes from memory
    678   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
    679   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    680   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, otherwise the %vec2 byte
    681   ret <64 x i8> %res
    682 }
    683 
    684 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %mask) { ; Zero-masked byte shuffle of a <64 x i8> loaded from %vp; unselected lanes become zero.
    685 ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask2:
    686 ; CHECK:       # %bb.0:
    687 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
    688 ; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
    689 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
    690 ; CHECK-NEXT:    retq
    691   %vec = load <64 x i8>, <64 x i8>* %vp ; shuffle source comes from memory
    692   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
    693   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    694   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ; shuffled byte where %cmp is true, otherwise zero
    695   ret <64 x i8> %res
    696 }
    697 
    698 define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) { ; Unmasked byte shuffle of a <64 x i8> loaded from %vp; each index stays inside its own 16-byte group.
    699 ; CHECK-LABEL: test_64xi8_perm_mem_mask3:
    700 ; CHECK:       # %bb.0:
    701 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
    702 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
    703 ; CHECK-NEXT:    retq
    704   %vec = load <64 x i8>, <64 x i8>* %vp ; shuffle source comes from memory
    705   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
    706   ret <64 x i8> %res
    707 }
    708 define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) { ; Merge-masked byte shuffle of a <64 x i8> loaded from %vp; unselected lanes keep %vec2.
    709 ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask3:
    710 ; CHECK:       # %bb.0:
    711 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
    712 ; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
    713 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
    714 ; CHECK-NEXT:    retq
    715   %vec = load <64 x i8>, <64 x i8>* %vp ; shuffle source comes from memory
    716   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
    717   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    718   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, otherwise the %vec2 byte
    719   ret <64 x i8> %res
    720 }
    721 
    722 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %mask) { ; Zero-masked byte shuffle of a <64 x i8> loaded from %vp; unselected lanes become zero.
    723 ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask3:
    724 ; CHECK:       # %bb.0:
    725 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
    726 ; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
    727 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
    728 ; CHECK-NEXT:    retq
    729   %vec = load <64 x i8>, <64 x i8>* %vp ; shuffle source comes from memory
    730   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
    731   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true for each lane whose mask byte is zero
    732   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ; shuffled byte where %cmp is true, otherwise zero
    733   ret <64 x i8> %res
    734 }
    735 
    736 define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) { ; Unmasked shuffle that permutes elements 4-7 among themselves and leaves elements 0-3 in place.
    737 ; CHECK-LABEL: test_8xi16_perm_high_mask0:
    738 ; CHECK:       # %bb.0:
    739 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6]
    740 ; CHECK-NEXT:    retq
    741   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
    742   ret <8 x i16> %res
    743 }
    744 define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; Merge-masked shuffle of elements 4-7 (0-3 stay in place); unselected lanes keep %vec2.
    745 ; CHECK-LABEL: test_masked_8xi16_perm_high_mask0:
    746 ; CHECK:       # %bb.0:
    747 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
    748 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6]
    749 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
    750 ; CHECK-NEXT:    retq
    751   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
    752   %cmp = icmp eq <8 x i16> %mask, zeroinitializer ; true for each lane whose mask word is zero
    753   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 ; shuffled word where %cmp is true, otherwise the %vec2 word
    754   ret <8 x i16> %res
    755 }
    756 
    757 define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) { ; Zero-masked shuffle of elements 4-7 (0-3 stay in place); unselected lanes become zero.
    758 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask0:
    759 ; CHECK:       # %bb.0:
    760 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
    761 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6]
    762 ; CHECK-NEXT:    retq
    763   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
    764   %cmp = icmp eq <8 x i16> %mask, zeroinitializer ; true for each lane whose mask word is zero
    765   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ; shuffled word where %cmp is true, otherwise zero
    766   ret <8 x i16> %res
    767 }
    768 define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; Merge-masked shuffle of elements 0-3 (4-7 stay in place); unselected lanes keep %vec2.
    769 ; CHECK-LABEL: test_masked_8xi16_perm_low_mask1:
    770 ; CHECK:       # %bb.0:
    771 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
    772 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7]
    773 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
    774 ; CHECK-NEXT:    retq
    775   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
    776   %cmp = icmp eq <8 x i16> %mask, zeroinitializer ; true for each lane whose mask word is zero
    777   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 ; shuffled word where %cmp is true, otherwise the %vec2 word
    778   ret <8 x i16> %res
    779 }
    780 
    781 define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) { ; Zero-masked shuffle of elements 0-3 (4-7 stay in place); unselected lanes become zero.
    782 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask1:
    783 ; CHECK:       # %bb.0:
    784 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
    785 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7]
    786 ; CHECK-NEXT:    retq
    787   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
    788   %cmp = icmp eq <8 x i16> %mask, zeroinitializer ; true for each lane whose mask word is zero
    789   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ; shuffled word where %cmp is true, otherwise zero
    790   ret <8 x i16> %res
    791 }
    792 define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; Merge-masked shuffle of elements 4-7 (0-3 stay in place); unselected lanes keep %vec2.
    793 ; CHECK-LABEL: test_masked_8xi16_perm_high_mask2:
    794 ; CHECK:       # %bb.0:
    795 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
    796 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5]
    797 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
    798 ; CHECK-NEXT:    retq
    799   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
    800   %cmp = icmp eq <8 x i16> %mask, zeroinitializer ; true for each lane whose mask word is zero
    801   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 ; shuffled word where %cmp is true, otherwise the %vec2 word
    802   ret <8 x i16> %res
    803 }
    804 
    805 define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) { ; Zero-masked shuffle of elements 4-7 (0-3 stay in place); unselected lanes become zero.
    806 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask2:
    807 ; CHECK:       # %bb.0:
    808 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
    809 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5]
    810 ; CHECK-NEXT:    retq
    811   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
    812   %cmp = icmp eq <8 x i16> %mask, zeroinitializer ; true for each lane whose mask word is zero
    813   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ; shuffled word where %cmp is true, otherwise zero
    814   ret <8 x i16> %res
    815 }
    816 define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) {
        ; Unmasked: shuffles only i16 lanes 0-3 of %vec (lanes 4-7 stay in place); a single vpshuflw is expected.
    817 ; CHECK-LABEL: test_8xi16_perm_low_mask3:
    818 ; CHECK:       # %bb.0:
    819 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7]
    820 ; CHECK-NEXT:    retq
    821   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
    822   ret <8 x i16> %res
    823 }
    824 define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masking: lanes where %mask is zero take the shuffled %vec element (only lanes 0-3 move), the others keep %vec2.
    825 ; CHECK-LABEL: test_masked_8xi16_perm_low_mask3:
    826 ; CHECK:       # %bb.0:
    827 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
    828 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7]
    829 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
    830 ; CHECK-NEXT:    retq
    831   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
    832   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
    833   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
    834   ret <8 x i16> %res
    835 }
    836 
    837 define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) {
        ; Zero-masking: lanes where %mask is zero take the shuffled %vec element (only lanes 0-3 move), the others become zero.
    838 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask3:
    839 ; CHECK:       # %bb.0:
    840 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
    841 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7]
    842 ; CHECK-NEXT:    retq
    843   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
    844   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
    845   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
    846   ret <8 x i16> %res
    847 }
    848 define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masking: lanes where %mask is zero take the shuffled %vec element (only lanes 4-7 move), the others keep %vec2.
    849 ; CHECK-LABEL: test_masked_8xi16_perm_high_mask4:
    850 ; CHECK:       # %bb.0:
    851 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
    852 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6]
    853 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
    854 ; CHECK-NEXT:    retq
    855   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
    856   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
    857   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
    858   ret <8 x i16> %res
    859 }
    860 
    861 define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) {
        ; Zero-masking: lanes where %mask is zero take the shuffled %vec element (only lanes 4-7 move), the others become zero.
    862 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask4:
    863 ; CHECK:       # %bb.0:
    864 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
    865 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6]
    866 ; CHECK-NEXT:    retq
    867   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
    868   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
    869   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
    870   ret <8 x i16> %res
    871 }
    872 define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masking: lanes where %mask is zero take the shuffled %vec element (only lanes 0-3 move), the others keep %vec2.
    873 ; CHECK-LABEL: test_masked_8xi16_perm_low_mask5:
    874 ; CHECK:       # %bb.0:
    875 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
    876 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7]
    877 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
    878 ; CHECK-NEXT:    retq
    879   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
    880   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
    881   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
    882   ret <8 x i16> %res
    883 }
    884 
    885 define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) {
        ; Zero-masking: lanes where %mask is zero take the shuffled %vec element (only lanes 0-3 move), the others become zero.
    886 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask5:
    887 ; CHECK:       # %bb.0:
    888 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
    889 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7]
    890 ; CHECK-NEXT:    retq
    891   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
    892   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
    893   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
    894   ret <8 x i16> %res
    895 }
    896 define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) {
        ; Unmasked: shuffles only i16 lanes 4-7 of %vec (lanes 0-3 stay in place); a single vpshufhw is expected.
    897 ; CHECK-LABEL: test_8xi16_perm_high_mask6:
    898 ; CHECK:       # %bb.0:
    899 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5]
    900 ; CHECK-NEXT:    retq
    901   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
    902   ret <8 x i16> %res
    903 }
    904 define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masking: lanes where %mask is zero take the shuffled %vec element (only lanes 4-7 move), the others keep %vec2.
    905 ; CHECK-LABEL: test_masked_8xi16_perm_high_mask6:
    906 ; CHECK:       # %bb.0:
    907 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
    908 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5]
    909 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
    910 ; CHECK-NEXT:    retq
    911   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
    912   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
    913   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
    914   ret <8 x i16> %res
    915 }
    916 
    917 define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) {
        ; Zero-masking: lanes where %mask is zero take the shuffled %vec element (only lanes 4-7 move), the others become zero.
    918 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask6:
    919 ; CHECK:       # %bb.0:
    920 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
    921 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5]
    922 ; CHECK-NEXT:    retq
    923   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
    924   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
    925   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
    926   ret <8 x i16> %res
    927 }
    928 define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masking: lanes where %mask is zero take the shuffled %vec element (only lanes 0-3 move), the others keep %vec2.
    929 ; CHECK-LABEL: test_masked_8xi16_perm_low_mask7:
    930 ; CHECK:       # %bb.0:
    931 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
    932 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7]
    933 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
    934 ; CHECK-NEXT:    retq
    935   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
    936   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
    937   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
    938   ret <8 x i16> %res
    939 }
    940 
    941 define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) {
        ; Zero-masking: lanes where %mask is zero take the shuffled %vec element (only lanes 0-3 move), the others become zero.
    942 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask7:
    943 ; CHECK:       # %bb.0:
    944 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
    945 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7]
    946 ; CHECK-NEXT:    retq
    947   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
    948   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
    949   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
    950   ret <8 x i16> %res
    951 }
    952 define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) {
        ; Unmasked, memory source: shuffles only lanes 4-7 of the vector loaded from %vp; the load is expected to fold into vpshufhw as a mem operand.
    953 ; CHECK-LABEL: test_8xi16_perm_high_mem_mask0:
    954 ; CHECK:       # %bb.0:
    955 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6]
    956 ; CHECK-NEXT:    retq
    957   %vec = load <8 x i16>, <8 x i16>* %vp
    958   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
    959   ret <8 x i16> %res
    960 }
    961 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 4-7 move), the others keep %vec2.
    962 ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask0:
    963 ; CHECK:       # %bb.0:
    964 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
    965 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6]
    966 ; CHECK-NEXT:    retq
    967   %vec = load <8 x i16>, <8 x i16>* %vp
    968   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
    969   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
    970   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
    971   ret <8 x i16> %res
    972 }
    973 
    974 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 4-7 move), the others become zero.
    975 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
    976 ; CHECK:       # %bb.0:
    977 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
    978 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6]
    979 ; CHECK-NEXT:    retq
    980   %vec = load <8 x i16>, <8 x i16>* %vp
    981   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
    982   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
    983   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
    984   ret <8 x i16> %res
    985 }
    986 
    987 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 0-3 move), the others keep %vec2.
    988 ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask1:
    989 ; CHECK:       # %bb.0:
    990 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
    991 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7]
    992 ; CHECK-NEXT:    retq
    993   %vec = load <8 x i16>, <8 x i16>* %vp
    994   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
    995   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
    996   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
    997   ret <8 x i16> %res
    998 }
    999 
   1000 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 0-3 move), the others become zero.
   1001 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
   1002 ; CHECK:       # %bb.0:
   1003 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
   1004 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7]
   1005 ; CHECK-NEXT:    retq
   1006   %vec = load <8 x i16>, <8 x i16>* %vp
   1007   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   1008   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   1009   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   1010   ret <8 x i16> %res
   1011 }
   1012 
   1013 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 4-7 move), the others keep %vec2.
   1014 ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask2:
   1015 ; CHECK:       # %bb.0:
   1016 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
   1017 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7]
   1018 ; CHECK-NEXT:    retq
   1019   %vec = load <8 x i16>, <8 x i16>* %vp
   1020   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
   1021   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   1022   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   1023   ret <8 x i16> %res
   1024 }
   1025 
   1026 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 4-7 move), the others become zero.
   1027 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
   1028 ; CHECK:       # %bb.0:
   1029 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
   1030 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7]
   1031 ; CHECK-NEXT:    retq
   1032   %vec = load <8 x i16>, <8 x i16>* %vp
   1033   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
   1034   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   1035   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   1036   ret <8 x i16> %res
   1037 }
   1038 
   1039 define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) {
        ; Unmasked, memory source: shuffles only lanes 0-3 of the vector loaded from %vp; the load is expected to fold into vpshuflw as a mem operand.
   1040 ; CHECK-LABEL: test_8xi16_perm_low_mem_mask3:
   1041 ; CHECK:       # %bb.0:
   1042 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7]
   1043 ; CHECK-NEXT:    retq
   1044   %vec = load <8 x i16>, <8 x i16>* %vp
   1045   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   1046   ret <8 x i16> %res
   1047 }
   1048 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 0-3 move), the others keep %vec2.
   1049 ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask3:
   1050 ; CHECK:       # %bb.0:
   1051 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
   1052 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7]
   1053 ; CHECK-NEXT:    retq
   1054   %vec = load <8 x i16>, <8 x i16>* %vp
   1055   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   1056   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   1057   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   1058   ret <8 x i16> %res
   1059 }
   1060 
   1061 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 0-3 move), the others become zero.
   1062 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
   1063 ; CHECK:       # %bb.0:
   1064 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
   1065 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7]
   1066 ; CHECK-NEXT:    retq
   1067   %vec = load <8 x i16>, <8 x i16>* %vp
   1068   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   1069   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   1070   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   1071   ret <8 x i16> %res
   1072 }
   1073 
   1074 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 4-7 move), the others keep %vec2.
   1075 ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask4:
   1076 ; CHECK:       # %bb.0:
   1077 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
   1078 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5]
   1079 ; CHECK-NEXT:    retq
   1080   %vec = load <8 x i16>, <8 x i16>* %vp
   1081   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
   1082   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   1083   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   1084   ret <8 x i16> %res
   1085 }
   1086 
   1087 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 4-7 move), the others become zero.
   1088 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
   1089 ; CHECK:       # %bb.0:
   1090 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
   1091 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5]
   1092 ; CHECK-NEXT:    retq
   1093   %vec = load <8 x i16>, <8 x i16>* %vp
   1094   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
   1095   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   1096   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   1097   ret <8 x i16> %res
   1098 }
   1099 
   1100 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 0-3 move), the others keep %vec2.
   1101 ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask5:
   1102 ; CHECK:       # %bb.0:
   1103 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
   1104 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7]
   1105 ; CHECK-NEXT:    retq
   1106   %vec = load <8 x i16>, <8 x i16>* %vp
   1107   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   1108   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   1109   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   1110   ret <8 x i16> %res
   1111 }
   1112 
   1113 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 0-3 move), the others become zero.
   1114 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
   1115 ; CHECK:       # %bb.0:
   1116 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
   1117 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7]
   1118 ; CHECK-NEXT:    retq
   1119   %vec = load <8 x i16>, <8 x i16>* %vp
   1120   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   1121   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   1122   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   1123   ret <8 x i16> %res
   1124 }
   1125 
   1126 define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) {
        ; Unmasked, memory source: shuffles only lanes 4-7 of the vector loaded from %vp; the load is expected to fold into vpshufhw as a mem operand.
   1127 ; CHECK-LABEL: test_8xi16_perm_high_mem_mask6:
   1128 ; CHECK:       # %bb.0:
   1129 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4]
   1130 ; CHECK-NEXT:    retq
   1131   %vec = load <8 x i16>, <8 x i16>* %vp
   1132   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
   1133   ret <8 x i16> %res
   1134 }
   1135 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 4-7 move), the others keep %vec2.
   1136 ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask6:
   1137 ; CHECK:       # %bb.0:
   1138 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
   1139 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4]
   1140 ; CHECK-NEXT:    retq
   1141   %vec = load <8 x i16>, <8 x i16>* %vp
   1142   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
   1143   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   1144   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   1145   ret <8 x i16> %res
   1146 }
   1147 
   1148 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 4-7 move), the others become zero.
   1149 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
   1150 ; CHECK:       # %bb.0:
   1151 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
   1152 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4]
   1153 ; CHECK-NEXT:    retq
   1154   %vec = load <8 x i16>, <8 x i16>* %vp
   1155   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
   1156   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   1157   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   1158   ret <8 x i16> %res
   1159 }
   1160 
   1161 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 0-3 move), the others keep %vec2.
   1162 ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask7:
   1163 ; CHECK:       # %bb.0:
   1164 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
   1165 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7]
   1166 ; CHECK-NEXT:    retq
   1167   %vec = load <8 x i16>, <8 x i16>* %vp
   1168   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
   1169   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   1170   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   1171   ret <8 x i16> %res
   1172 }
   1173 
   1174 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masking, memory source: lanes where %mask is zero take the shuffled element of the vector loaded from %vp (only lanes 0-3 move), the others become zero.
   1175 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
   1176 ; CHECK:       # %bb.0:
   1177 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
   1178 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7]
   1179 ; CHECK-NEXT:    retq
   1180   %vec = load <8 x i16>, <8 x i16>* %vp
   1181   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
   1182   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   1183   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   1184   ret <8 x i16> %res
   1185 }
   1186 
   1187 define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) {
        ; Unmasked, 16 lanes: the same high-lane pattern is applied in each 8-lane half (lanes 4-7 and 12-15 move); a single ymm vpshufhw is expected.
   1188 ; CHECK-LABEL: test_16xi16_perm_high_mask0:
   1189 ; CHECK:       # %bb.0:
   1190 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
   1191 ; CHECK-NEXT:    retq
   1192   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
   1193   ret <16 x i16> %res
   1194 }
   1195 define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masking, 16 lanes: lanes where %mask is zero take the shuffled %vec element (lanes 4-7 and 12-15 move), the others keep %vec2.
   1196 ; CHECK-LABEL: test_masked_16xi16_perm_high_mask0:
   1197 ; CHECK:       # %bb.0:
   1198 ; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
   1199 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
   1200 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1201 ; CHECK-NEXT:    retq
   1202   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
   1203   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1204   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1205   ret <16 x i16> %res
   1206 }
   1207 
   1208 define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masking, 16 lanes: lanes where %mask is zero take the shuffled %vec element (lanes 4-7 and 12-15 move), the others become zero.
   1209 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask0:
   1210 ; CHECK:       # %bb.0:
   1211 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1212 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
   1213 ; CHECK-NEXT:    retq
   1214   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
   1215   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1216   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1217   ret <16 x i16> %res
   1218 }
   1219 define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masking, 16 lanes: lanes where %mask is zero take the shuffled %vec element (lanes 0-3 and 8-11 move), the others keep %vec2.
   1220 ; CHECK-LABEL: test_masked_16xi16_perm_low_mask1:
   1221 ; CHECK:       # %bb.0:
   1222 ; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
   1223 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
   1224 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1225 ; CHECK-NEXT:    retq
   1226   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   1227   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1228   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1229   ret <16 x i16> %res
   1230 }
   1231 
   1232 define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masking, 16 lanes: lanes where %mask is zero take the shuffled %vec element (lanes 0-3 and 8-11 move), the others become zero.
   1233 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask1:
   1234 ; CHECK:       # %bb.0:
   1235 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1236 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
   1237 ; CHECK-NEXT:    retq
   1238   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   1239   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1240   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1241   ret <16 x i16> %res
   1242 }
   1243 define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masking, 16 lanes: lanes where %mask is zero take the shuffled %vec element (lanes 4-7 and 12-15 move), the others keep %vec2.
   1244 ; CHECK-LABEL: test_masked_16xi16_perm_high_mask2:
   1245 ; CHECK:       # %bb.0:
   1246 ; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
   1247 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
   1248 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1249 ; CHECK-NEXT:    retq
   1250   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
   1251   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1252   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1253   ret <16 x i16> %res
   1254 }
   1255 
   1256 define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masking, 16 lanes: lanes where %mask is zero take the shuffled %vec element (lanes 4-7 and 12-15 move), the others become zero.
   1257 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask2:
   1258 ; CHECK:       # %bb.0:
   1259 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1260 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
   1261 ; CHECK-NEXT:    retq
   1262   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
   1263   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1264   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1265   ret <16 x i16> %res
   1266 }
   1267 define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) {
        ; Unmasked, 16 lanes: the same low-lane pattern is applied in each 8-lane half (lanes 0-3 and 8-11 move); a single ymm vpshuflw is expected.
   1268 ; CHECK-LABEL: test_16xi16_perm_low_mask3:
   1269 ; CHECK:       # %bb.0:
   1270 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
   1271 ; CHECK-NEXT:    retq
   1272   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   1273   ret <16 x i16> %res
   1274 }
   1275 define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masking, 16 lanes: lanes where %mask is zero take the shuffled %vec element (lanes 0-3 and 8-11 move), the others keep %vec2.
   1276 ; CHECK-LABEL: test_masked_16xi16_perm_low_mask3:
   1277 ; CHECK:       # %bb.0:
   1278 ; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
   1279 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
   1280 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1281 ; CHECK-NEXT:    retq
   1282   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   1283   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1284   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1285   ret <16 x i16> %res
   1286 }
   1287 
   1288 define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masking, 16 lanes: lanes where %mask is zero take the shuffled %vec element (lanes 0-3 and 8-11 move), the others become zero.
   1289 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask3:
   1290 ; CHECK:       # %bb.0:
   1291 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1292 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
   1293 ; CHECK-NEXT:    retq
   1294   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   1295   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1296   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1297   ret <16 x i16> %res
   1298 }
   1299 define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masking, 16 lanes: lanes where %mask is zero take the shuffled %vec element (lanes 4-7 and 12-15 move), the others keep %vec2.
   1300 ; CHECK-LABEL: test_masked_16xi16_perm_high_mask4:
   1301 ; CHECK:       # %bb.0:
   1302 ; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
   1303 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
   1304 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1305 ; CHECK-NEXT:    retq
   1306   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
   1307   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1308   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1309   ret <16 x i16> %res
   1310 }
   1311 
   1312 define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masking, 16 lanes: lanes where %mask is zero take the shuffled %vec element (lanes 4-7 and 12-15 move), the others become zero.
   1313 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask4:
   1314 ; CHECK:       # %bb.0:
   1315 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1316 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
   1317 ; CHECK-NEXT:    retq
   1318   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
   1319   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1320   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1321   ret <16 x i16> %res
   1322 }
   1323 define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masking, 16 lanes: lanes where %mask is zero take the shuffled %vec element (lanes 0-3 and 8-11 move), the others keep %vec2.
   1324 ; CHECK-LABEL: test_masked_16xi16_perm_low_mask5:
   1325 ; CHECK:       # %bb.0:
   1326 ; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
   1327 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
   1328 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1329 ; CHECK-NEXT:    retq
   1330   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   1331   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1332   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1333   ret <16 x i16> %res
   1334 }
   1335 
   1336 define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masking, 16 lanes: lanes where %mask is zero take the shuffled %vec element (lanes 0-3 and 8-11 move), the others become zero.
   1337 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask5:
   1338 ; CHECK:       # %bb.0:
   1339 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1340 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
   1341 ; CHECK-NEXT:    retq
   1342   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   1343   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1344   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1345   ret <16 x i16> %res
   1346 }
   1347 define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) {
        ; Unmasked, 16 lanes: the same high-lane pattern is applied in each 8-lane half (lanes 4-7 and 12-15 move); a single ymm vpshufhw is expected.
   1348 ; CHECK-LABEL: test_16xi16_perm_high_mask6:
   1349 ; CHECK:       # %bb.0:
   1350 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
   1351 ; CHECK-NEXT:    retq
   1352   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
   1353   ret <16 x i16> %res
   1354 }
   1355 define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
   1356 ; CHECK-LABEL: test_masked_16xi16_perm_high_mask6:
   1357 ; CHECK:       # %bb.0:
   1358 ; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
   1359 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
   1360 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1361 ; CHECK-NEXT:    retq
          ; Merge-masked form: elements whose %mask entry is zero take the permuted value,
          ; the rest keep the corresponding %vec2 element.
   1362   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
   1363   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1364   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1365   ret <16 x i16> %res
   1366 }
   1367 
   1368 define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) {
   1369 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask6:
   1370 ; CHECK:       # %bb.0:
   1371 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1372 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
   1373 ; CHECK-NEXT:    retq
          ; Zero-masked form: elements whose %mask entry is zero take the permuted value,
          ; every other element of the result is zero.
   1374   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
   1375   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1376   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1377   ret <16 x i16> %res
   1378 }
   1379 define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
   1380 ; CHECK-LABEL: test_masked_16xi16_perm_low_mask7:
   1381 ; CHECK:       # %bb.0:
   1382 ; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
   1383 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
   1384 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   1385 ; CHECK-NEXT:    retq
          ; Merge-masked form: elements whose %mask entry is zero take the permuted value,
          ; the rest keep the corresponding %vec2 element.
   1386   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
   1387   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1388   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1389   ret <16 x i16> %res
   1390 }
   1391 
   1392 define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) {
   1393 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask7:
   1394 ; CHECK:       # %bb.0:
   1395 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1396 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
   1397 ; CHECK-NEXT:    retq
          ; Zero-masked form: elements whose %mask entry is zero take the permuted value,
          ; every other element of the result is zero.
   1398   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
   1399   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1400   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1401   ret <16 x i16> %res
   1402 }
   1403 define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
   1404 ; CHECK-LABEL: test_16xi16_perm_high_mem_mask0:
   1405 ; CHECK:       # %bb.0:
   1406 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
   1407 ; CHECK-NEXT:    retq
          ; Unmasked memory form: the shuffle source is loaded from %vp and the checks
          ; expect that load to appear as the vpshufhw memory operand.
   1408   %vec = load <16 x i16>, <16 x i16>* %vp
   1409   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
   1410   ret <16 x i16> %res
   1411 }
   1412 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
   1413 ; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask0:
   1414 ; CHECK:       # %bb.0:
   1415 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1416 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
   1417 ; CHECK-NEXT:    retq
          ; Merge-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, the rest keep %vec2.
   1418   %vec = load <16 x i16>, <16 x i16>* %vp
   1419   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
   1420   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1421   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1422   ret <16 x i16> %res
   1423 }
   1424 
   1425 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
   1426 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
   1427 ; CHECK:       # %bb.0:
   1428 ; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
   1429 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
   1430 ; CHECK-NEXT:    retq
          ; Zero-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, all others are zero.
   1431   %vec = load <16 x i16>, <16 x i16>* %vp
   1432   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
   1433   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1434   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1435   ret <16 x i16> %res
   1436 }
   1437 
   1438 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
   1439 ; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask1:
   1440 ; CHECK:       # %bb.0:
   1441 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1442 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
   1443 ; CHECK-NEXT:    retq
          ; Merge-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, the rest keep %vec2.
   1444   %vec = load <16 x i16>, <16 x i16>* %vp
   1445   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   1446   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1447   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1448   ret <16 x i16> %res
   1449 }
   1450 
   1451 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
   1452 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
   1453 ; CHECK:       # %bb.0:
   1454 ; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
   1455 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
   1456 ; CHECK-NEXT:    retq
          ; Zero-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, all others are zero.
   1457   %vec = load <16 x i16>, <16 x i16>* %vp
   1458   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   1459   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1460   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1461   ret <16 x i16> %res
   1462 }
   1463 
   1464 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
   1465 ; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask2:
   1466 ; CHECK:       # %bb.0:
   1467 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1468 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
   1469 ; CHECK-NEXT:    retq
          ; Merge-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, the rest keep %vec2.
   1470   %vec = load <16 x i16>, <16 x i16>* %vp
   1471   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
   1472   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1473   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1474   ret <16 x i16> %res
   1475 }
   1476 
   1477 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
   1478 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
   1479 ; CHECK:       # %bb.0:
   1480 ; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
   1481 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
   1482 ; CHECK-NEXT:    retq
          ; Zero-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, all others are zero.
   1483   %vec = load <16 x i16>, <16 x i16>* %vp
   1484   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
   1485   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1486   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1487   ret <16 x i16> %res
   1488 }
   1489 
   1490 define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
   1491 ; CHECK-LABEL: test_16xi16_perm_low_mem_mask3:
   1492 ; CHECK:       # %bb.0:
   1493 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
   1494 ; CHECK-NEXT:    retq
          ; Unmasked memory form: the shuffle source is loaded from %vp and the checks
          ; expect that load to appear as the vpshuflw memory operand.
   1495   %vec = load <16 x i16>, <16 x i16>* %vp
   1496   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   1497   ret <16 x i16> %res
   1498 }
   1499 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
   1500 ; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask3:
   1501 ; CHECK:       # %bb.0:
   1502 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1503 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
   1504 ; CHECK-NEXT:    retq
          ; Merge-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, the rest keep %vec2.
   1505   %vec = load <16 x i16>, <16 x i16>* %vp
   1506   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   1507   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1508   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1509   ret <16 x i16> %res
   1510 }
   1511 
   1512 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
   1513 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
   1514 ; CHECK:       # %bb.0:
   1515 ; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
   1516 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
   1517 ; CHECK-NEXT:    retq
          ; Zero-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, all others are zero.
   1518   %vec = load <16 x i16>, <16 x i16>* %vp
   1519   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   1520   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1521   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1522   ret <16 x i16> %res
   1523 }
   1524 
   1525 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
   1526 ; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask4:
   1527 ; CHECK:       # %bb.0:
   1528 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1529 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
   1530 ; CHECK-NEXT:    retq
          ; Merge-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, the rest keep %vec2.
   1531   %vec = load <16 x i16>, <16 x i16>* %vp
   1532   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
   1533   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1534   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1535   ret <16 x i16> %res
   1536 }
   1537 
   1538 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) {
   1539 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
   1540 ; CHECK:       # %bb.0:
   1541 ; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
   1542 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
   1543 ; CHECK-NEXT:    retq
          ; Zero-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, all others are zero.
   1544   %vec = load <16 x i16>, <16 x i16>* %vp
   1545   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
   1546   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1547   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1548   ret <16 x i16> %res
   1549 }
   1550 
   1551 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
   1552 ; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask5:
   1553 ; CHECK:       # %bb.0:
   1554 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1555 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
   1556 ; CHECK-NEXT:    retq
          ; Merge-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, the rest keep %vec2.
   1557   %vec = load <16 x i16>, <16 x i16>* %vp
   1558   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   1559   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1560   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1561   ret <16 x i16> %res
   1562 }
   1563 
   1564 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) {
   1565 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
   1566 ; CHECK:       # %bb.0:
   1567 ; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
   1568 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
   1569 ; CHECK-NEXT:    retq
          ; Zero-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, all others are zero.
   1570   %vec = load <16 x i16>, <16 x i16>* %vp
   1571   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   1572   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1573   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1574   ret <16 x i16> %res
   1575 }
   1576 
   1577 define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
   1578 ; CHECK-LABEL: test_16xi16_perm_high_mem_mask6:
   1579 ; CHECK:       # %bb.0:
   1580 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
   1581 ; CHECK-NEXT:    retq
          ; Unmasked memory form: the shuffle source is loaded from %vp and the checks
          ; expect that load to appear as the vpshufhw memory operand.
   1582   %vec = load <16 x i16>, <16 x i16>* %vp
   1583   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
   1584   ret <16 x i16> %res
   1585 }
   1586 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
   1587 ; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask6:
   1588 ; CHECK:       # %bb.0:
   1589 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1590 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
   1591 ; CHECK-NEXT:    retq
          ; Merge-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, the rest keep %vec2.
   1592   %vec = load <16 x i16>, <16 x i16>* %vp
   1593   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
   1594   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1595   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1596   ret <16 x i16> %res
   1597 }
   1598 
   1599 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) {
   1600 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
   1601 ; CHECK:       # %bb.0:
   1602 ; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
   1603 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
   1604 ; CHECK-NEXT:    retq
          ; Zero-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, all others are zero.
   1605   %vec = load <16 x i16>, <16 x i16>* %vp
   1606   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
   1607   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1608   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1609   ret <16 x i16> %res
   1610 }
   1611 
   1612 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
   1613 ; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask7:
   1614 ; CHECK:       # %bb.0:
   1615 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
   1616 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
   1617 ; CHECK-NEXT:    retq
          ; Merge-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, the rest keep %vec2.
   1618   %vec = load <16 x i16>, <16 x i16>* %vp
   1619   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   1620   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1621   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   1622   ret <16 x i16> %res
   1623 }
   1624 
   1625 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) {
   1626 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
   1627 ; CHECK:       # %bb.0:
   1628 ; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
   1629 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
   1630 ; CHECK-NEXT:    retq
          ; Zero-masked memory form: the vector loaded from %vp is permuted; elements whose
          ; %mask entry is zero take the permuted value, all others are zero.
   1631   %vec = load <16 x i16>, <16 x i16>* %vp
   1632   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   1633   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   1634   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   1635   ret <16 x i16> %res
   1636 }
   1637 
   1638 define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) {
   1639 ; CHECK-LABEL: test_32xi16_perm_high_mask0:
   1640 ; CHECK:       # %bb.0:
   1641 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
   1642 ; CHECK-NEXT:    retq
          ; Unmasked 32-element form: the same upper-four permutation is repeated in each
          ; group of eight i16; the checks expect a single vpshufhw on zmm.
   1643   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   1644   ret <32 x i16> %res
   1645 }
   1646 define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
   1647 ; CHECK-LABEL: test_masked_32xi16_perm_high_mask0:
   1648 ; CHECK:       # %bb.0:
   1649 ; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
   1650 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
   1651 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1652 ; CHECK-NEXT:    retq
          ; Merge-masked form: elements whose %mask entry is zero take the permuted value,
          ; the rest keep the corresponding %vec2 element.
   1653   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   1654   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1655   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   1656   ret <32 x i16> %res
   1657 }
   1658 
   1659 define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) {
   1660 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask0:
   1661 ; CHECK:       # %bb.0:
   1662 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   1663 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
   1664 ; CHECK-NEXT:    retq
          ; Zero-masked form: elements whose %mask entry is zero take the permuted value,
          ; every other element of the result is zero.
   1665   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   1666   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1667   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   1668   ret <32 x i16> %res
   1669 }
   1670 define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
   1671 ; CHECK-LABEL: test_masked_32xi16_perm_low_mask1:
   1672 ; CHECK:       # %bb.0:
   1673 ; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
   1674 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
   1675 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1676 ; CHECK-NEXT:    retq
          ; Merge-masked form: elements whose %mask entry is zero take the permuted value,
          ; the rest keep the corresponding %vec2 element.
   1677   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
   1678   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1679   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   1680   ret <32 x i16> %res
   1681 }
   1682 
   1683 define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) {
   1684 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask1:
   1685 ; CHECK:       # %bb.0:
   1686 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   1687 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
   1688 ; CHECK-NEXT:    retq
          ; Zero-masked form: elements whose %mask entry is zero take the permuted value,
          ; every other element of the result is zero.
   1689   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
   1690   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1691   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   1692   ret <32 x i16> %res
   1693 }
   1694 define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
   1695 ; CHECK-LABEL: test_masked_32xi16_perm_high_mask2:
   1696 ; CHECK:       # %bb.0:
   1697 ; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
   1698 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
   1699 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1700 ; CHECK-NEXT:    retq
          ; Merge-masked form: elements whose %mask entry is zero take the permuted value,
          ; the rest keep the corresponding %vec2 element.
   1701   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
   1702   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1703   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   1704   ret <32 x i16> %res
   1705 }
   1706 
   1707 define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) {
   1708 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask2:
   1709 ; CHECK:       # %bb.0:
   1710 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   1711 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
   1712 ; CHECK-NEXT:    retq
          ; Zero-masked form: elements whose %mask entry is zero take the permuted value,
          ; every other element of the result is zero.
   1713   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
   1714   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1715   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   1716   ret <32 x i16> %res
   1717 }
   1718 define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) {
   1719 ; CHECK-LABEL: test_32xi16_perm_low_mask3:
   1720 ; CHECK:       # %bb.0:
   1721 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
   1722 ; CHECK-NEXT:    retq
          ; Unmasked 32-element form: the same low-four permutation is repeated in each
          ; group of eight i16; the checks expect a single vpshuflw on zmm.
   1723   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   1724   ret <32 x i16> %res
   1725 }
   1726 define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
   1727 ; CHECK-LABEL: test_masked_32xi16_perm_low_mask3:
   1728 ; CHECK:       # %bb.0:
   1729 ; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
   1730 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
   1731 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1732 ; CHECK-NEXT:    retq
          ; Merge-masked form: elements whose %mask entry is zero take the permuted value,
          ; the rest keep the corresponding %vec2 element.
   1733   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   1734   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1735   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   1736   ret <32 x i16> %res
   1737 }
   1738 
   1739 define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) {
   1740 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask3:
   1741 ; CHECK:       # %bb.0:
   1742 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   1743 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
   1744 ; CHECK-NEXT:    retq
          ; Zero-masked form: elements whose %mask entry is zero take the permuted value,
          ; every other element of the result is zero.
   1745   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   1746   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1747   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   1748   ret <32 x i16> %res
   1749 }
   1750 define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
   1751 ; CHECK-LABEL: test_masked_32xi16_perm_high_mask4:
   1752 ; CHECK:       # %bb.0:
   1753 ; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
   1754 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
   1755 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1756 ; CHECK-NEXT:    retq
          ; Merge-masked form: elements whose %mask entry is zero take the permuted value,
          ; the rest keep the corresponding %vec2 element.
   1757   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
   1758   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1759   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   1760   ret <32 x i16> %res
   1761 }
   1762 
   1763 define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) {
   1764 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask4:
   1765 ; CHECK:       # %bb.0:
   1766 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   1767 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
   1768 ; CHECK-NEXT:    retq
          ; Zero-masked form: elements whose %mask entry is zero take the permuted value,
          ; every other element of the result is zero.
   1769   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
   1770   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1771   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   1772   ret <32 x i16> %res
   1773 }
   1774 define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
   1775 ; CHECK-LABEL: test_masked_32xi16_perm_low_mask5:
   1776 ; CHECK:       # %bb.0:
   1777 ; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
   1778 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
   1779 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1780 ; CHECK-NEXT:    retq
          ; Merge-masked form: elements whose %mask entry is zero take the permuted value,
          ; the rest keep the corresponding %vec2 element.
   1781   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
   1782   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1783   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   1784   ret <32 x i16> %res
   1785 }
   1786 
   1787 define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) {
   1788 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask5:
   1789 ; CHECK:       # %bb.0:
   1790 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   1791 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
   1792 ; CHECK-NEXT:    retq
          ; Zero-masked form: elements whose %mask entry is zero take the permuted value,
          ; every other element of the result is zero.
   1793   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
   1794   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1795   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   1796   ret <32 x i16> %res
   1797 }
   1798 define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) {
   1799 ; CHECK-LABEL: test_32xi16_perm_high_mask6:
   1800 ; CHECK:       # %bb.0:
   1801 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
   1802 ; CHECK-NEXT:    retq
          ; Unmasked 32-element form: the same upper-four permutation is repeated in each
          ; group of eight i16; the checks expect a single vpshufhw on zmm.
   1803   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   1804   ret <32 x i16> %res
   1805 }
   1806 define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
   1807 ; CHECK-LABEL: test_masked_32xi16_perm_high_mask6:
   1808 ; CHECK:       # %bb.0:
   1809 ; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
   1810 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
   1811 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1812 ; CHECK-NEXT:    retq
          ; Merge-masked form: elements whose %mask entry is zero take the permuted value,
          ; the rest keep the corresponding %vec2 element.
   1813   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   1814   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1815   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   1816   ret <32 x i16> %res
   1817 }
   1818 
   1819 define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) {
        ; Zero-masked variant: lanes where %mask is zero take the high-word shuffle
        ; of %vec, all other lanes become zero. The assertions expect vptestnmw to
        ; build k1 and a single {z}-masked vpshufhw.
   1820 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask6:
   1821 ; CHECK:       # %bb.0:
   1822 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   1823 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
   1824 ; CHECK-NEXT:    retq
   1825   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   1826   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1827   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   1828   ret <32 x i16> %res
   1829 }
   1830 define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked variant: lanes where %mask is zero take the low-word shuffle
        ; of %vec (elements 0-3 of each group of eight i16), all other lanes keep
        ; %vec2. The assertions expect vptestnmw to build k1, a k1-masked vpshuflw
        ; into zmm1, and a copy of zmm1 to zmm0.
   1831 ; CHECK-LABEL: test_masked_32xi16_perm_low_mask7:
   1832 ; CHECK:       # %bb.0:
   1833 ; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
   1834 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
   1835 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   1836 ; CHECK-NEXT:    retq
   1837   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   1838   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1839   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   1840   ret <32 x i16> %res
   1841 }
   1842 
   1843 define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) {
        ; Zero-masked variant: lanes where %mask is zero take the low-word shuffle
        ; of %vec, all other lanes become zero. The assertions expect vptestnmw to
        ; build k1 and a single {z}-masked vpshuflw.
   1844 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask7:
   1845 ; CHECK:       # %bb.0:
   1846 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   1847 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
   1848 ; CHECK-NEXT:    retq
   1849   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   1850   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1851   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   1852   ret <32 x i16> %res
   1853 }
   1854 define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
        ; Unmasked variant with the source vector loaded from %vp: shuffles elements
        ; 4-7 of each group of eight i16. The assertions expect a single vpshufhw
        ; that takes its source as a memory operand (no separate load).
   1855 ; CHECK-LABEL: test_32xi16_perm_high_mem_mask0:
   1856 ; CHECK:       # %bb.0:
   1857 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
   1858 ; CHECK-NEXT:    retq
   1859   %vec = load <32 x i16>, <32 x i16>* %vp
   1860   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
   1861   ret <32 x i16> %res
   1862 }
   1863 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the high-word shuffle, all other lanes keep %vec2. The assertions
        ; expect vptestnmw to build k1 and a k1-masked vpshufhw from memory written
        ; directly into zmm0.
   1864 ; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask0:
   1865 ; CHECK:       # %bb.0:
   1866 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   1867 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
   1868 ; CHECK-NEXT:    retq
   1869   %vec = load <32 x i16>, <32 x i16>* %vp
   1870   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
   1871   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1872   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   1873   ret <32 x i16> %res
   1874 }
   1875 
   1876 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the high-word shuffle, all other lanes become zero. The
        ; assertions expect vptestnmw to build k1 and a {z}-masked vpshufhw from
        ; memory.
   1877 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
   1878 ; CHECK:       # %bb.0:
   1879 ; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
   1880 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
   1881 ; CHECK-NEXT:    retq
   1882   %vec = load <32 x i16>, <32 x i16>* %vp
   1883   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
   1884   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1885   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   1886   ret <32 x i16> %res
   1887 }
   1888 
   1889 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the low-word shuffle (elements 0-3 of each group of eight i16),
        ; all other lanes keep %vec2. The assertions expect vptestnmw to build k1 and
        ; a k1-masked vpshuflw from memory written directly into zmm0.
   1890 ; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask1:
   1891 ; CHECK:       # %bb.0:
   1892 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   1893 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
   1894 ; CHECK-NEXT:    retq
   1895   %vec = load <32 x i16>, <32 x i16>* %vp
   1896   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
   1897   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1898   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   1899   ret <32 x i16> %res
   1900 }
   1901 
   1902 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the low-word shuffle, all other lanes become zero. The assertions
        ; expect vptestnmw to build k1 and a {z}-masked vpshuflw from memory.
   1903 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
   1904 ; CHECK:       # %bb.0:
   1905 ; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
   1906 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
   1907 ; CHECK-NEXT:    retq
   1908   %vec = load <32 x i16>, <32 x i16>* %vp
   1909   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
   1910   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1911   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   1912   ret <32 x i16> %res
   1913 }
   1914 
   1915 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the high-word shuffle (elements 4-7 of each group of eight i16),
        ; all other lanes keep %vec2. The assertions expect vptestnmw to build k1 and
        ; a k1-masked vpshufhw from memory written directly into zmm0.
   1916 ; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask2:
   1917 ; CHECK:       # %bb.0:
   1918 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   1919 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
   1920 ; CHECK-NEXT:    retq
   1921   %vec = load <32 x i16>, <32 x i16>* %vp
   1922   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
   1923   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1924   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   1925   ret <32 x i16> %res
   1926 }
   1927 
   1928 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the high-word shuffle, all other lanes become zero. The
        ; assertions expect vptestnmw to build k1 and a {z}-masked vpshufhw from
        ; memory.
   1929 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
   1930 ; CHECK:       # %bb.0:
   1931 ; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
   1932 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
   1933 ; CHECK-NEXT:    retq
   1934   %vec = load <32 x i16>, <32 x i16>* %vp
   1935   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
   1936   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1937   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   1938   ret <32 x i16> %res
   1939 }
   1940 
   1941 define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
        ; Unmasked variant with the source vector loaded from %vp: shuffles elements
        ; 0-3 of each group of eight i16. The assertions expect a single vpshuflw
        ; that takes its source as a memory operand (no separate load).
   1942 ; CHECK-LABEL: test_32xi16_perm_low_mem_mask3:
   1943 ; CHECK:       # %bb.0:
   1944 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
   1945 ; CHECK-NEXT:    retq
   1946   %vec = load <32 x i16>, <32 x i16>* %vp
   1947   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
   1948   ret <32 x i16> %res
   1949 }
   1950 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the low-word shuffle, all other lanes keep %vec2. The assertions
        ; expect vptestnmw to build k1 and a k1-masked vpshuflw from memory written
        ; directly into zmm0.
   1951 ; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask3:
   1952 ; CHECK:       # %bb.0:
   1953 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   1954 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
   1955 ; CHECK-NEXT:    retq
   1956   %vec = load <32 x i16>, <32 x i16>* %vp
   1957   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
   1958   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1959   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   1960   ret <32 x i16> %res
   1961 }
   1962 
   1963 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the low-word shuffle, all other lanes become zero. The assertions
        ; expect vptestnmw to build k1 and a {z}-masked vpshuflw from memory.
   1964 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
   1965 ; CHECK:       # %bb.0:
   1966 ; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
   1967 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
   1968 ; CHECK-NEXT:    retq
   1969   %vec = load <32 x i16>, <32 x i16>* %vp
   1970   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
   1971   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1972   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   1973   ret <32 x i16> %res
   1974 }
   1975 
   1976 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the high-word shuffle, all other lanes keep %vec2. The assertions
        ; expect vptestnmw to build k1 and a k1-masked vpshufhw from memory written
        ; directly into zmm0.
   1977 ; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask4:
   1978 ; CHECK:       # %bb.0:
   1979 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   1980 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
   1981 ; CHECK-NEXT:    retq
   1982   %vec = load <32 x i16>, <32 x i16>* %vp
   1983   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
   1984   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1985   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   1986   ret <32 x i16> %res
   1987 }
   1988 
   1989 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the high-word shuffle, all other lanes become zero. The
        ; assertions expect vptestnmw to build k1 and a {z}-masked vpshufhw from
        ; memory.
   1990 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
   1991 ; CHECK:       # %bb.0:
   1992 ; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
   1993 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
   1994 ; CHECK-NEXT:    retq
   1995   %vec = load <32 x i16>, <32 x i16>* %vp
   1996   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
   1997   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   1998   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   1999   ret <32 x i16> %res
   2000 }
   2001 
   2002 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked variant with the source loaded from %vp. The word indices here
        ; move whole i16 pairs (0,1,0,1,4,5,6,7 per group of eight), i.e. dwords
        ; 0,0,2,3 per group of four. The assertions expect an unmasked vpshufd from
        ; memory into zmm2 followed by a k1-masked vmovdqu16 into zmm0, rather than a
        ; masked vpshuflw as in the sibling low-word tests.
   2003 ; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask5:
   2004 ; CHECK:       # %bb.0:
   2005 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
   2006 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   2007 ; CHECK-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1}
   2008 ; CHECK-NEXT:    retq
   2009   %vec = load <32 x i16>, <32 x i16>* %vp
   2010   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
   2011   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   2012   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   2013   ret <32 x i16> %res
   2014 }
   2015 
   2016 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked variant with the source loaded from %vp. The word indices move
        ; whole i16 pairs (dwords 0,0,2,3 per group of four). The assertions expect
        ; an unmasked vpshufd from memory into zmm1 followed by a {z}-masked
        ; vmovdqu16 into zmm0, rather than a masked vpshuflw.
   2017 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
   2018 ; CHECK:       # %bb.0:
   2019 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
   2020 ; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
   2021 ; CHECK-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z}
   2022 ; CHECK-NEXT:    retq
   2023   %vec = load <32 x i16>, <32 x i16>* %vp
   2024   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
   2025   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   2026   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   2027   ret <32 x i16> %res
   2028 }
   2029 
   2030 define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
        ; Unmasked variant with the source vector loaded from %vp: shuffles elements
        ; 4-7 of each group of eight i16. The assertions expect a single vpshufhw
        ; that takes its source as a memory operand (no separate load).
   2031 ; CHECK-LABEL: test_32xi16_perm_high_mem_mask6:
   2032 ; CHECK:       # %bb.0:
   2033 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
   2034 ; CHECK-NEXT:    retq
   2035   %vec = load <32 x i16>, <32 x i16>* %vp
   2036   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
   2037   ret <32 x i16> %res
   2038 }
   2039 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the high-word shuffle, all other lanes keep %vec2. The assertions
        ; expect vptestnmw to build k1 and a k1-masked vpshufhw from memory written
        ; directly into zmm0.
   2040 ; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask6:
   2041 ; CHECK:       # %bb.0:
   2042 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   2043 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
   2044 ; CHECK-NEXT:    retq
   2045   %vec = load <32 x i16>, <32 x i16>* %vp
   2046   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
   2047   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   2048   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   2049   ret <32 x i16> %res
   2050 }
   2051 
   2052 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the high-word shuffle, all other lanes become zero. The
        ; assertions expect vptestnmw to build k1 and a {z}-masked vpshufhw from
        ; memory.
   2053 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
   2054 ; CHECK:       # %bb.0:
   2055 ; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
   2056 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
   2057 ; CHECK-NEXT:    retq
   2058   %vec = load <32 x i16>, <32 x i16>* %vp
   2059   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
   2060   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   2061   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   2062   ret <32 x i16> %res
   2063 }
   2064 
   2065 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the low-word shuffle, all other lanes keep %vec2. The assertions
        ; expect vptestnmw to build k1 and a k1-masked vpshuflw from memory written
        ; directly into zmm0.
   2066 ; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask7:
   2067 ; CHECK:       # %bb.0:
   2068 ; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
   2069 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
   2070 ; CHECK-NEXT:    retq
   2071   %vec = load <32 x i16>, <32 x i16>* %vp
   2072   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   2073   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   2074   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   2075   ret <32 x i16> %res
   2076 }
   2077 
   2078 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the low-word shuffle, all other lanes become zero. The assertions
        ; expect vptestnmw to build k1 and a {z}-masked vpshuflw from memory.
   2079 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
   2080 ; CHECK:       # %bb.0:
   2081 ; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
   2082 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
   2083 ; CHECK-NEXT:    retq
   2084   %vec = load <32 x i16>, <32 x i16>* %vp
   2085   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   2086   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   2087   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   2088   ret <32 x i16> %res
   2089 }
   2090 
   2091 define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) {
        ; Unmasked single-source shuffle of four i32 elements (indices 2,3,3,0).
        ; The assertions expect vpermilps here, whereas the masked siblings of this
        ; test expect vpshufd.
   2092 ; CHECK-LABEL: test_4xi32_perm_mask0:
   2093 ; CHECK:       # %bb.0:
   2094 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0]
   2095 ; CHECK-NEXT:    retq
   2096   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
   2097   ret <4 x i32> %res
   2098 }
   2099 define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
        ; Merge-masked variant: lanes where %mask is zero take the shuffle of %vec
        ; (indices 2,3,3,0), all other lanes keep %vec2. The assertions expect
        ; vptestnmd to build k1, a k1-masked vpshufd into xmm1, and a copy of xmm1
        ; to xmm0.
   2100 ; CHECK-LABEL: test_masked_4xi32_perm_mask0:
   2101 ; CHECK:       # %bb.0:
   2102 ; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
   2103 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0]
   2104 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
   2105 ; CHECK-NEXT:    retq
   2106   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
   2107   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2108   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   2109   ret <4 x i32> %res
   2110 }
   2111 
   2112 define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) {
        ; Zero-masked variant: lanes where %mask is zero take the shuffle of %vec
        ; (indices 2,3,3,0), all other lanes become zero. The assertions expect
        ; vptestnmd to build k1 and a single {z}-masked vpshufd.
   2113 ; CHECK-LABEL: test_masked_z_4xi32_perm_mask0:
   2114 ; CHECK:       # %bb.0:
   2115 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
   2116 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0]
   2117 ; CHECK-NEXT:    retq
   2118   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
   2119   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2120   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   2121   ret <4 x i32> %res
   2122 }
   2123 define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
        ; Merge-masked variant: lanes where %mask is zero take the shuffle of %vec
        ; (indices 1,0,2,0), all other lanes keep %vec2. The assertions expect
        ; vptestnmd to build k1, a k1-masked vpshufd into xmm1, and a copy of xmm1
        ; to xmm0.
   2124 ; CHECK-LABEL: test_masked_4xi32_perm_mask1:
   2125 ; CHECK:       # %bb.0:
   2126 ; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
   2127 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0]
   2128 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
   2129 ; CHECK-NEXT:    retq
   2130   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
   2131   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2132   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   2133   ret <4 x i32> %res
   2134 }
   2135 
   2136 define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) {
        ; Zero-masked variant: lanes where %mask is zero take the shuffle of %vec
        ; (indices 1,0,2,0), all other lanes become zero. The assertions expect
        ; vptestnmd to build k1 and a single {z}-masked vpshufd.
   2137 ; CHECK-LABEL: test_masked_z_4xi32_perm_mask1:
   2138 ; CHECK:       # %bb.0:
   2139 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
   2140 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0]
   2141 ; CHECK-NEXT:    retq
   2142   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
   2143   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2144   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   2145   ret <4 x i32> %res
   2146 }
   2147 define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
        ; Merge-masked variant: lanes where %mask is zero take the shuffle of %vec
        ; (indices 3,0,1,0), all other lanes keep %vec2. The assertions expect
        ; vptestnmd to build k1, a k1-masked vpshufd into xmm1, and a copy of xmm1
        ; to xmm0.
   2148 ; CHECK-LABEL: test_masked_4xi32_perm_mask2:
   2149 ; CHECK:       # %bb.0:
   2150 ; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
   2151 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0]
   2152 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
   2153 ; CHECK-NEXT:    retq
   2154   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
   2155   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2156   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   2157   ret <4 x i32> %res
   2158 }
   2159 
   2160 define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) {
        ; Zero-masked variant: lanes where %mask is zero take the shuffle of %vec
        ; (indices 3,0,1,0), all other lanes become zero. The assertions expect
        ; vptestnmd to build k1 and a single {z}-masked vpshufd.
   2161 ; CHECK-LABEL: test_masked_z_4xi32_perm_mask2:
   2162 ; CHECK:       # %bb.0:
   2163 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
   2164 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0]
   2165 ; CHECK-NEXT:    retq
   2166   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
   2167   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2168   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   2169   ret <4 x i32> %res
   2170 }
   2171 define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) {
        ; Unmasked single-source shuffle of four i32 elements (indices 1,1,0,3).
        ; The assertions expect vpermilps here, whereas the masked siblings of this
        ; test expect vpshufd.
   2172 ; CHECK-LABEL: test_4xi32_perm_mask3:
   2173 ; CHECK:       # %bb.0:
   2174 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3]
   2175 ; CHECK-NEXT:    retq
   2176   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   2177   ret <4 x i32> %res
   2178 }
   2179 define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
        ; Merge-masked variant: lanes where %mask is zero take the shuffle of %vec
        ; (indices 1,1,0,3), all other lanes keep %vec2. The assertions expect
        ; vptestnmd to build k1, a k1-masked vpshufd into xmm1, and a copy of xmm1
        ; to xmm0.
   2180 ; CHECK-LABEL: test_masked_4xi32_perm_mask3:
   2181 ; CHECK:       # %bb.0:
   2182 ; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
   2183 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3]
   2184 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
   2185 ; CHECK-NEXT:    retq
   2186   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   2187   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2188   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   2189   ret <4 x i32> %res
   2190 }
   2191 
   2192 define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) {
        ; Zero-masked variant: lanes where %mask is zero take the shuffle of %vec
        ; (indices 1,1,0,3), all other lanes become zero. The assertions expect
        ; vptestnmd to build k1 and a single {z}-masked vpshufd.
   2193 ; CHECK-LABEL: test_masked_z_4xi32_perm_mask3:
   2194 ; CHECK:       # %bb.0:
   2195 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
   2196 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3]
   2197 ; CHECK-NEXT:    retq
   2198   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   2199   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2200   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   2201   ret <4 x i32> %res
   2202 }
   2203 define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) {
        ; Unmasked shuffle (indices 0,1,3,3) of a vector loaded from %vp. The
        ; assertions expect a single vpermilps that takes its source as a memory
        ; operand (no separate load).
   2204 ; CHECK-LABEL: test_4xi32_perm_mem_mask0:
   2205 ; CHECK:       # %bb.0:
   2206 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3]
   2207 ; CHECK-NEXT:    retq
   2208   %vec = load <4 x i32>, <4 x i32>* %vp
   2209   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   2210   ret <4 x i32> %res
   2211 }
   2212 define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
        ; Merge-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the shuffle (indices 0,1,3,3), all other lanes keep %vec2. The
        ; assertions expect vptestnmd to build k1 and a k1-masked vpshufd from memory
        ; written directly into xmm0.
   2213 ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask0:
   2214 ; CHECK:       # %bb.0:
   2215 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
   2216 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3]
   2217 ; CHECK-NEXT:    retq
   2218   %vec = load <4 x i32>, <4 x i32>* %vp
   2219   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   2220   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2221   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   2222   ret <4 x i32> %res
   2223 }
   2224 
   2225 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) {
        ; Zero-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the shuffle (indices 0,1,3,3), all other lanes become zero. The
        ; assertions expect vptestnmd to build k1 and a {z}-masked vpshufd from
        ; memory.
   2226 ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask0:
   2227 ; CHECK:       # %bb.0:
   2228 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
   2229 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3]
   2230 ; CHECK-NEXT:    retq
   2231   %vec = load <4 x i32>, <4 x i32>* %vp
   2232   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   2233   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2234   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   2235   ret <4 x i32> %res
   2236 }
   2237 
   2238 define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
        ; Merge-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the shuffle (indices 2,2,3,1), all other lanes keep %vec2. The
        ; assertions expect vptestnmd to build k1 and a k1-masked vpshufd from memory
        ; written directly into xmm0.
   2239 ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask1:
   2240 ; CHECK:       # %bb.0:
   2241 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
   2242 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1]
   2243 ; CHECK-NEXT:    retq
   2244   %vec = load <4 x i32>, <4 x i32>* %vp
   2245   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
   2246   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2247   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   2248   ret <4 x i32> %res
   2249 }
   2250 
   2251 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) {
        ; Zero-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the shuffle (indices 2,2,3,1), all other lanes become zero. The
        ; assertions expect vptestnmd to build k1 and a {z}-masked vpshufd from
        ; memory.
   2252 ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask1:
   2253 ; CHECK:       # %bb.0:
   2254 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
   2255 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1]
   2256 ; CHECK-NEXT:    retq
   2257   %vec = load <4 x i32>, <4 x i32>* %vp
   2258   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
   2259   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2260   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   2261   ret <4 x i32> %res
   2262 }
   2263 
   2264 define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
        ; Merge-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the shuffle (indices 0,3,0,1), all other lanes keep %vec2. The
        ; assertions expect vptestnmd to build k1 and a k1-masked vpshufd from memory
        ; written directly into xmm0.
   2265 ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask2:
   2266 ; CHECK:       # %bb.0:
   2267 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
   2268 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1]
   2269 ; CHECK-NEXT:    retq
   2270   %vec = load <4 x i32>, <4 x i32>* %vp
   2271   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
   2272   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2273   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   2274   ret <4 x i32> %res
   2275 }
   2276 
   2277 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) {
        ; Zero-masked variant with the source loaded from %vp: lanes where %mask is
        ; zero take the shuffle (indices 0,3,0,1), all other lanes become zero. The
        ; assertions expect vptestnmd to build k1 and a {z}-masked vpshufd from
        ; memory.
   2278 ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask2:
   2279 ; CHECK:       # %bb.0:
   2280 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
   2281 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1]
   2282 ; CHECK-NEXT:    retq
   2283   %vec = load <4 x i32>, <4 x i32>* %vp
   2284   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
   2285   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2286   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   2287   ret <4 x i32> %res
   2288 }
   2289 
   2290 define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) { ; Unmasked shuffle of the vector loaded from %vp (expected asm: vpermilps with the load folded as a mem operand).
   2291 ; CHECK-LABEL: test_4xi32_perm_mem_mask3:
   2292 ; CHECK:       # %bb.0:
   2293 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0]
   2294 ; CHECK-NEXT:    retq
   2295   %vec = load <4 x i32>, <4 x i32>* %vp
   2296   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   2297   ret <4 x i32> %res
   2298 }
   2299 define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; Merge-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm folds the load into the masked vpshufd).
   2300 ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask3:
   2301 ; CHECK:       # %bb.0:
   2302 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
   2303 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0]
   2304 ; CHECK-NEXT:    retq
   2305   %vec = load <4 x i32>, <4 x i32>* %vp
   2306   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   2307   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2308   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2309   ret <4 x i32> %res
   2310 }
   2311 
   2312 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) { ; Zero-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm folds the load into vpshufd with {z}).
   2313 ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask3:
   2314 ; CHECK:       # %bb.0:
   2315 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
   2316 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0]
   2317 ; CHECK-NEXT:    retq
   2318   %vec = load <4 x i32>, <4 x i32>* %vp
   2319   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   2320   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   2321   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2322   ret <4 x i32> %res
   2323 }
   2324 
   2325 define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) { ; Unmasked shuffle of %vec by a constant index vector (expected asm: a single vpermilps).
   2326 ; CHECK-LABEL: test_8xi32_perm_mask0:
   2327 ; CHECK:       # %bb.0:
   2328 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4]
   2329 ; CHECK-NEXT:    retq
   2330   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
   2331   ret <8 x i32> %res
   2332 }
   2333 define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm: vptestnmd + masked vpshufd + register move).
   2334 ; CHECK-LABEL: test_masked_8xi32_perm_mask0:
   2335 ; CHECK:       # %bb.0:
   2336 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
   2337 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4]
   2338 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   2339 ; CHECK-NEXT:    retq
   2340   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
   2341   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2342   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2343   ret <8 x i32> %res
   2344 }
   2345 
   2346 define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) { ; Zero-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm: vptestnmd + vpshufd with {z}).
   2347 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
   2348 ; CHECK:       # %bb.0:
   2349 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
   2350 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4]
   2351 ; CHECK-NEXT:    retq
   2352   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
   2353   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2354   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2355   ret <8 x i32> %res
   2356 }
   2357 define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm: vptestnmd + masked vpshufd + register move).
   2358 ; CHECK-LABEL: test_masked_8xi32_perm_mask1:
   2359 ; CHECK:       # %bb.0:
   2360 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
   2361 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7]
   2362 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   2363 ; CHECK-NEXT:    retq
   2364   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
   2365   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2366   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2367   ret <8 x i32> %res
   2368 }
   2369 
   2370 define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) { ; Zero-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm: vptestnmd + vpshufd with {z}).
   2371 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
   2372 ; CHECK:       # %bb.0:
   2373 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
   2374 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7]
   2375 ; CHECK-NEXT:    retq
   2376   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
   2377   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2378   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2379   ret <8 x i32> %res
   2380 }
   2381 define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm: vptestnmd + masked vpshufd + register move).
   2382 ; CHECK-LABEL: test_masked_8xi32_perm_mask2:
   2383 ; CHECK:       # %bb.0:
   2384 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
   2385 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7]
   2386 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   2387 ; CHECK-NEXT:    retq
   2388   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
   2389   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2390   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2391   ret <8 x i32> %res
   2392 }
   2393 
   2394 define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) { ; Zero-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm: vptestnmd + vpshufd with {z}).
   2395 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
   2396 ; CHECK:       # %bb.0:
   2397 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
   2398 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7]
   2399 ; CHECK-NEXT:    retq
   2400   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
   2401   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2402   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2403   ret <8 x i32> %res
   2404 }
   2405 define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) { ; Unmasked shuffle of %vec by a constant index vector (expected asm: a single vpermilps).
   2406 ; CHECK-LABEL: test_8xi32_perm_mask3:
   2407 ; CHECK:       # %bb.0:
   2408 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4]
   2409 ; CHECK-NEXT:    retq
   2410   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
   2411   ret <8 x i32> %res
   2412 }
   2413 define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm: vptestnmd + masked vpshufd + register move).
   2414 ; CHECK-LABEL: test_masked_8xi32_perm_mask3:
   2415 ; CHECK:       # %bb.0:
   2416 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
   2417 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4]
   2418 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
   2419 ; CHECK-NEXT:    retq
   2420   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
   2421   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2422   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2423   ret <8 x i32> %res
   2424 }
   2425 
   2426 define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) { ; Zero-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm: vptestnmd + vpshufd with {z}).
   2427 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
   2428 ; CHECK:       # %bb.0:
   2429 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
   2430 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4]
   2431 ; CHECK-NEXT:    retq
   2432   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
   2433   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2434   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2435   ret <8 x i32> %res
   2436 }
   2437 define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) { ; Unmasked shuffle of the vector loaded from %vp (expected asm: vpermilps with the load folded as a mem operand).
   2438 ; CHECK-LABEL: test_8xi32_perm_mem_mask0:
   2439 ; CHECK:       # %bb.0:
   2440 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4]
   2441 ; CHECK-NEXT:    retq
   2442   %vec = load <8 x i32>, <8 x i32>* %vp
   2443   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
   2444   ret <8 x i32> %res
   2445 }
   2446 define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm folds the load into the masked vpshufd).
   2447 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
   2448 ; CHECK:       # %bb.0:
   2449 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
   2450 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4]
   2451 ; CHECK-NEXT:    retq
   2452   %vec = load <8 x i32>, <8 x i32>* %vp
   2453   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
   2454   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2455   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2456   ret <8 x i32> %res
   2457 }
   2458 
   2459 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) { ; Zero-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm folds the load into vpshufd with {z}).
   2460 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
   2461 ; CHECK:       # %bb.0:
   2462 ; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
   2463 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4]
   2464 ; CHECK-NEXT:    retq
   2465   %vec = load <8 x i32>, <8 x i32>* %vp
   2466   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
   2467   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2468   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2469   ret <8 x i32> %res
   2470 }
   2471 
   2472 define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm folds the load into the masked vpshufd).
   2473 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
   2474 ; CHECK:       # %bb.0:
   2475 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
   2476 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4]
   2477 ; CHECK-NEXT:    retq
   2478   %vec = load <8 x i32>, <8 x i32>* %vp
   2479   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
   2480   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2481   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2482   ret <8 x i32> %res
   2483 }
   2484 
   2485 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) { ; Zero-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm folds the load into vpshufd with {z}).
   2486 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
   2487 ; CHECK:       # %bb.0:
   2488 ; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
   2489 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
   2490 ; CHECK-NEXT:    retq
   2491   %vec = load <8 x i32>, <8 x i32>* %vp
   2492   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
   2493   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2494   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2495   ret <8 x i32> %res
   2496 }
   2497 
   2498 define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm folds the load into the masked vpshufd).
   2499 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
   2500 ; CHECK:       # %bb.0:
   2501 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
   2502 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5]
   2503 ; CHECK-NEXT:    retq
   2504   %vec = load <8 x i32>, <8 x i32>* %vp
   2505   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
   2506   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2507   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2508   ret <8 x i32> %res
   2509 }
   2510 
   2511 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) { ; Zero-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm folds the load into vpshufd with {z}).
   2512 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
   2513 ; CHECK:       # %bb.0:
   2514 ; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
   2515 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5]
   2516 ; CHECK-NEXT:    retq
   2517   %vec = load <8 x i32>, <8 x i32>* %vp
   2518   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
   2519   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2520   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2521   ret <8 x i32> %res
   2522 }
   2523 
   2524 define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) { ; Unmasked shuffle of the vector loaded from %vp (expected asm: vpermilps with the load folded as a mem operand).
   2525 ; CHECK-LABEL: test_8xi32_perm_mem_mask3:
   2526 ; CHECK:       # %bb.0:
   2527 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4]
   2528 ; CHECK-NEXT:    retq
   2529   %vec = load <8 x i32>, <8 x i32>* %vp
   2530   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
   2531   ret <8 x i32> %res
   2532 }
   2533 define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm folds the load into the masked vpshufd).
   2534 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
   2535 ; CHECK:       # %bb.0:
   2536 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
   2537 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4]
   2538 ; CHECK-NEXT:    retq
   2539   %vec = load <8 x i32>, <8 x i32>* %vp
   2540   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
   2541   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2542   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2543   ret <8 x i32> %res
   2544 }
   2545 
   2546 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) { ; Zero-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm folds the load into vpshufd with {z}).
   2547 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
   2548 ; CHECK:       # %bb.0:
   2549 ; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
   2550 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4]
   2551 ; CHECK-NEXT:    retq
   2552   %vec = load <8 x i32>, <8 x i32>* %vp
   2553   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
   2554   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   2555   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2556   ret <8 x i32> %res
   2557 }
   2558 
   2559 define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) { ; Unmasked shuffle of %vec by a constant index vector (expected asm: a single vpermilps).
   2560 ; CHECK-LABEL: test_16xi32_perm_mask0:
   2561 ; CHECK:       # %bb.0:
   2562 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
   2563 ; CHECK-NEXT:    retq
   2564   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   2565   ret <16 x i32> %res
   2566 }
   2567 define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; Merge-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm: vptestnmd + masked vpshufd + register move).
   2568 ; CHECK-LABEL: test_masked_16xi32_perm_mask0:
   2569 ; CHECK:       # %bb.0:
   2570 ; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
   2571 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
   2572 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   2573 ; CHECK-NEXT:    retq
   2574   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   2575   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2576   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2577   ret <16 x i32> %res
   2578 }
   2579 
   2580 define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) { ; Zero-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm: vptestnmd + vpshufd with {z}).
   2581 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
   2582 ; CHECK:       # %bb.0:
   2583 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
   2584 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
   2585 ; CHECK-NEXT:    retq
   2586   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   2587   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2588   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2589   ret <16 x i32> %res
   2590 }
   2591 define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; Merge-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm: vptestnmd + masked vpshufd + register move).
   2592 ; CHECK-LABEL: test_masked_16xi32_perm_mask1:
   2593 ; CHECK:       # %bb.0:
   2594 ; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
   2595 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
   2596 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   2597 ; CHECK-NEXT:    retq
   2598   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
   2599   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2600   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2601   ret <16 x i32> %res
   2602 }
   2603 
   2604 define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) { ; Zero-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm: vptestnmd + vpshufd with {z}).
   2605 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
   2606 ; CHECK:       # %bb.0:
   2607 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
   2608 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
   2609 ; CHECK-NEXT:    retq
   2610   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
   2611   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2612   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2613   ret <16 x i32> %res
   2614 }
   2615 define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; Merge-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm: vptestnmd + masked vpshufd + register move).
   2616 ; CHECK-LABEL: test_masked_16xi32_perm_mask2:
   2617 ; CHECK:       # %bb.0:
   2618 ; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
   2619 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
   2620 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   2621 ; CHECK-NEXT:    retq
   2622   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
   2623   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2624   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2625   ret <16 x i32> %res
   2626 }
   2627 
   2628 define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) { ; Zero-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm: vptestnmd + vpshufd with {z}).
   2629 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
   2630 ; CHECK:       # %bb.0:
   2631 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
   2632 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
   2633 ; CHECK-NEXT:    retq
   2634   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
   2635   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2636   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2637   ret <16 x i32> %res
   2638 }
   2639 define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { ; Unmasked shuffle of %vec by a constant index vector (expected asm: a single vpermilps).
   2640 ; CHECK-LABEL: test_16xi32_perm_mask3:
   2641 ; CHECK:       # %bb.0:
   2642 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
   2643 ; CHECK-NEXT:    retq
   2644   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   2645   ret <16 x i32> %res
   2646 }
   2647 define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; Merge-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm: vptestnmd + masked vpshufd + register move).
   2648 ; CHECK-LABEL: test_masked_16xi32_perm_mask3:
   2649 ; CHECK:       # %bb.0:
   2650 ; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
   2651 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
   2652 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
   2653 ; CHECK-NEXT:    retq
   2654   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   2655   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2656   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2657   ret <16 x i32> %res
   2658 }
   2659 
   2660 define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) { ; Zero-masked shuffle of %vec: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm: vptestnmd + vpshufd with {z}).
   2661 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
   2662 ; CHECK:       # %bb.0:
   2663 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
   2664 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
   2665 ; CHECK-NEXT:    retq
   2666   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   2667   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2668   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2669   ret <16 x i32> %res
   2670 }
   2671 define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) { ; Unmasked shuffle of the vector loaded from %vp (expected asm: vpermilps with the load folded as a mem operand).
   2672 ; CHECK-LABEL: test_16xi32_perm_mem_mask0:
   2673 ; CHECK:       # %bb.0:
   2674 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
   2675 ; CHECK-NEXT:    retq
   2676   %vec = load <16 x i32>, <16 x i32>* %vp
   2677   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
   2678   ret <16 x i32> %res
   2679 }
   2680 define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; Merge-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm folds the load into the masked vpshufd).
   2681 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
   2682 ; CHECK:       # %bb.0:
   2683 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
   2684 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
   2685 ; CHECK-NEXT:    retq
   2686   %vec = load <16 x i32>, <16 x i32>* %vp
   2687   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
   2688   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2689   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2690   ret <16 x i32> %res
   2691 }
   2692 
   2693 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) { ; Zero-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm folds the load into vpshufd with {z}).
   2694 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
   2695 ; CHECK:       # %bb.0:
   2696 ; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
   2697 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
   2698 ; CHECK-NEXT:    retq
   2699   %vec = load <16 x i32>, <16 x i32>* %vp
   2700   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
   2701   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2702   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2703   ret <16 x i32> %res
   2704 }
   2705 
   2706 define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; Merge-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm folds the load into the masked vpshufd).
   2707 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
   2708 ; CHECK:       # %bb.0:
   2709 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
   2710 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
   2711 ; CHECK-NEXT:    retq
   2712   %vec = load <16 x i32>, <16 x i32>* %vp
   2713   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
   2714   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2715   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2716   ret <16 x i32> %res
   2717 }
   2718 
   2719 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) { ; Zero-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm folds the load into vpshufd with {z}).
   2720 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
   2721 ; CHECK:       # %bb.0:
   2722 ; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
   2723 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
   2724 ; CHECK-NEXT:    retq
   2725   %vec = load <16 x i32>, <16 x i32>* %vp
   2726   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
   2727   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2728   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2729   ret <16 x i32> %res
   2730 }
   2731 
   2732 define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; Merge-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm folds the load into the masked vpshufd).
   2733 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
   2734 ; CHECK:       # %bb.0:
   2735 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
   2736 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
   2737 ; CHECK-NEXT:    retq
   2738   %vec = load <16 x i32>, <16 x i32>* %vp
   2739   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
   2740   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2741   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2742   ret <16 x i32> %res
   2743 }
   2744 
   2745 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) { ; Zero-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm folds the load into vpshufd with {z}).
   2746 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
   2747 ; CHECK:       # %bb.0:
   2748 ; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
   2749 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
   2750 ; CHECK-NEXT:    retq
   2751   %vec = load <16 x i32>, <16 x i32>* %vp
   2752   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
   2753   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2754   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2755   ret <16 x i32> %res
   2756 }
   2757 
   2758 define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) { ; Unmasked shuffle of the vector loaded from %vp (expected asm: vpermilps with the load folded as a mem operand).
   2759 ; CHECK-LABEL: test_16xi32_perm_mem_mask3:
   2760 ; CHECK:       # %bb.0:
   2761 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
   2762 ; CHECK-NEXT:    retq
   2763   %vec = load <16 x i32>, <16 x i32>* %vp
   2764   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
   2765   ret <16 x i32> %res
   2766 }
   2767 define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; Merge-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest keep %vec2 (expected asm folds the load into the masked vpshufd).
   2768 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
   2769 ; CHECK:       # %bb.0:
   2770 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
   2771 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
   2772 ; CHECK-NEXT:    retq
   2773   %vec = load <16 x i32>, <16 x i32>* %vp
   2774   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
   2775   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2776   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 ; %cmp lane true -> %shuf lane, otherwise the %vec2 lane
   2777   ret <16 x i32> %res
   2778 }
   2779 
   2780 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) { ; Zero-masked shuffle of the vector loaded from %vp: lanes whose %mask element is zero get the shuffled value, the rest become zero (expected asm folds the load into vpshufd with {z}).
   2781 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
   2782 ; CHECK:       # %bb.0:
   2783 ; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
   2784 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
   2785 ; CHECK-NEXT:    retq
   2786   %vec = load <16 x i32>, <16 x i32>* %vp
   2787   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
   2788   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   2789   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; %cmp lane true -> %shuf lane, otherwise zero
   2790   ret <16 x i32> %res
   2791 }
   2792 
   2793