Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+avx512bw,+avx512vl | FileCheck %s --check-prefix=GENERIC
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=SKX
      4 
      5 ; This test is a collection of AVX-512 shuffle instructions, used to check their scheduling information.
      6 
      ; Unmasked <16 x i16> permute of a register operand using a constant index vector.
      ; Both the GENERIC and SKX checks expect a vmovdqa of the indices followed by one vpermw.
      7 define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) {
      8 ; GENERIC-LABEL: test_16xi16_perm_mask0:
      9 ; GENERIC:       # %bb.0:
     10 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
     11 ; GENERIC-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
     12 ; GENERIC-NEXT:    retq # sched: [1:1.00]
     13 ;
     14 ; SKX-LABEL: test_16xi16_perm_mask0:
     15 ; SKX:       # %bb.0:
     16 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
     17 ; SKX-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
     18 ; SKX-NEXT:    retq # sched: [7:1.00]
     19   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
     20   ret <16 x i16> %res
     21 }
     ; Merge-masked form: lanes whose %mask element is zero take the shuffled value,
     ; the remaining lanes take %vec2. The checks expect vptestnmw to set %k1, a vpermw
     ; into ymm1 under {%k1}, and a vmovdqa of ymm1 into ymm0.
     22 define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
     23 ; GENERIC-LABEL: test_masked_16xi16_perm_mask0:
     24 ; GENERIC:       # %bb.0:
     25 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
     26 ; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
     27 ; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
     28 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
     29 ; GENERIC-NEXT:    retq # sched: [1:1.00]
     30 ;
     31 ; SKX-LABEL: test_masked_16xi16_perm_mask0:
     32 ; SKX:       # %bb.0:
     33 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
     34 ; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
     35 ; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
     36 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
     37 ; SKX-NEXT:    retq # sched: [7:1.00]
     38   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
     39   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
     40   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
     41   ret <16 x i16> %res
     42 }
     43 
     ; Zero-masked form: lanes whose %mask element is zero take the shuffled value,
     ; the remaining lanes are zero. The checks expect vptestnmw to set %k1 and a
     ; vpermw with {%k1} {z}.
     44 define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %mask) {
     45 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mask0:
     46 ; GENERIC:       # %bb.0:
     47 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
     48 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
     49 ; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
     50 ; GENERIC-NEXT:    retq # sched: [1:1.00]
     51 ;
     52 ; SKX-LABEL: test_masked_z_16xi16_perm_mask0:
     53 ; SKX:       # %bb.0:
     54 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
     55 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
     56 ; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
     57 ; SKX-NEXT:    retq # sched: [7:1.00]
     58   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
     59   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
     60   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
     61   ret <16 x i16> %res
     62 }
     ; Merge-masked form with a second index pattern: lanes whose %mask element is zero
     ; take the shuffled value, the remaining lanes take %vec2. The checks expect
     ; vptestnmw to set %k1, a vpermw into ymm1 under {%k1}, and a vmovdqa of ymm1 into ymm0.
     63 define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
     64 ; GENERIC-LABEL: test_masked_16xi16_perm_mask1:
     65 ; GENERIC:       # %bb.0:
     66 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
     67 ; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
     68 ; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
     69 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
     70 ; GENERIC-NEXT:    retq # sched: [1:1.00]
     71 ;
     72 ; SKX-LABEL: test_masked_16xi16_perm_mask1:
     73 ; SKX:       # %bb.0:
     74 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
     75 ; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
     76 ; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
     77 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
     78 ; SKX-NEXT:    retq # sched: [7:1.00]
     79   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
     80   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
     81   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
     82   ret <16 x i16> %res
     83 }
     84 
     ; Zero-masked form with a second index pattern: lanes whose %mask element is zero
     ; take the shuffled value, the remaining lanes are zero. The checks expect
     ; vptestnmw to set %k1 and a vpermw with {%k1} {z}.
     85 define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %mask) {
     86 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mask1:
     87 ; GENERIC:       # %bb.0:
     88 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
     89 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
     90 ; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
     91 ; GENERIC-NEXT:    retq # sched: [1:1.00]
     92 ;
     93 ; SKX-LABEL: test_masked_z_16xi16_perm_mask1:
     94 ; SKX:       # %bb.0:
     95 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
     96 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
     97 ; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
     98 ; SKX-NEXT:    retq # sched: [7:1.00]
     99   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
    100   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
    101   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
    102   ret <16 x i16> %res
    103 }
    ; Merge-masked form with a third index pattern: lanes whose %mask element is zero
    ; take the shuffled value, the remaining lanes take %vec2. The checks expect
    ; vptestnmw to set %k1, a vpermw into ymm1 under {%k1}, and a vmovdqa of ymm1 into ymm0.
    104 define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
    105 ; GENERIC-LABEL: test_masked_16xi16_perm_mask2:
    106 ; GENERIC:       # %bb.0:
    107 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
    108 ; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
    109 ; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
    110 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
    111 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    112 ;
    113 ; SKX-LABEL: test_masked_16xi16_perm_mask2:
    114 ; SKX:       # %bb.0:
    115 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
    116 ; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
    117 ; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
    118 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
    119 ; SKX-NEXT:    retq # sched: [7:1.00]
    120   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
    121   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
    122   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
    123   ret <16 x i16> %res
    124 }
    125 
    ; Zero-masked form with a third index pattern: lanes whose %mask element is zero
    ; take the shuffled value, the remaining lanes are zero. The checks expect
    ; vptestnmw to set %k1 and a vpermw with {%k1} {z}.
    126 define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %mask) {
    127 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mask2:
    128 ; GENERIC:       # %bb.0:
    129 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
    130 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
    131 ; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
    132 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    133 ;
    134 ; SKX-LABEL: test_masked_z_16xi16_perm_mask2:
    135 ; SKX:       # %bb.0:
    136 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
    137 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
    138 ; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
    139 ; SKX-NEXT:    retq # sched: [7:1.00]
    140   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
    141   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
    142   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
    143   ret <16 x i16> %res
    144 }
    ; Unmasked <16 x i16> permute of a register operand using a constant index vector.
    ; Both the GENERIC and SKX checks expect a vmovdqa of the indices followed by one vpermw.
    145 define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) {
    146 ; GENERIC-LABEL: test_16xi16_perm_mask3:
    147 ; GENERIC:       # %bb.0:
    148 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
    149 ; GENERIC-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
    150 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    151 ;
    152 ; SKX-LABEL: test_16xi16_perm_mask3:
    153 ; SKX:       # %bb.0:
    154 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
    155 ; SKX-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
    156 ; SKX-NEXT:    retq # sched: [7:1.00]
    157   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
    158   ret <16 x i16> %res
    159 }
    ; Merge-masked form with a fourth index pattern: lanes whose %mask element is zero
    ; take the shuffled value, the remaining lanes take %vec2. The checks expect
    ; vptestnmw to set %k1, a vpermw into ymm1 under {%k1}, and a vmovdqa of ymm1 into ymm0.
    160 define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
    161 ; GENERIC-LABEL: test_masked_16xi16_perm_mask3:
    162 ; GENERIC:       # %bb.0:
    163 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
    164 ; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
    165 ; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
    166 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
    167 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    168 ;
    169 ; SKX-LABEL: test_masked_16xi16_perm_mask3:
    170 ; SKX:       # %bb.0:
    171 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
    172 ; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
    173 ; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
    174 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
    175 ; SKX-NEXT:    retq # sched: [7:1.00]
    176   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
    177   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
    178   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
    179   ret <16 x i16> %res
    180 }
    181 
    ; Zero-masked form with a fourth index pattern: lanes whose %mask element is zero
    ; take the shuffled value, the remaining lanes are zero. The checks expect
    ; vptestnmw to set %k1 and a vpermw with {%k1} {z}.
    182 define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %mask) {
    183 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mask3:
    184 ; GENERIC:       # %bb.0:
    185 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
    186 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
    187 ; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
    188 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    189 ;
    190 ; SKX-LABEL: test_masked_z_16xi16_perm_mask3:
    191 ; SKX:       # %bb.0:
    192 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
    193 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
    194 ; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
    195 ; SKX-NEXT:    retq # sched: [7:1.00]
    196   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
    197   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
    198   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
    199   ret <16 x i16> %res
    200 }
    ; Unmasked <16 x i16> permute of a vector loaded from %vp using a constant index vector.
    ; The checks expect the vpermw to take (%rdi) as its memory operand.
    201 define <16 x i16> @test_16xi16_perm_mem_mask0(<16 x i16>* %vp) {
    202 ; GENERIC-LABEL: test_16xi16_perm_mem_mask0:
    203 ; GENERIC:       # %bb.0:
    204 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
    205 ; GENERIC-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    206 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    207 ;
    208 ; SKX-LABEL: test_16xi16_perm_mem_mask0:
    209 ; SKX:       # %bb.0:
    210 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
    211 ; SKX-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
    212 ; SKX-NEXT:    retq # sched: [7:1.00]
    213   %vec = load <16 x i16>, <16 x i16>* %vp
    214   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
    215   ret <16 x i16> %res
    216 }
    ; Merge-masked permute of a vector loaded from %vp: lanes whose %mask element is zero
    ; take the shuffled value, the remaining lanes take %vec2. The checks expect vptestnmw
    ; to set %k1 and a vpermw from (%rdi) into ymm0 under {%k1}.
    217 define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
    218 ; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask0:
    219 ; GENERIC:       # %bb.0:
    220 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
    221 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
    222 ; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
    223 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    224 ;
    225 ; SKX-LABEL: test_masked_16xi16_perm_mem_mask0:
    226 ; SKX:       # %bb.0:
    227 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
    228 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
    229 ; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
    230 ; SKX-NEXT:    retq # sched: [7:1.00]
    231   %vec = load <16 x i16>, <16 x i16>* %vp
    232   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
    233   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
    234   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
    235   ret <16 x i16> %res
    236 }
    237 
    ; Zero-masked permute of a vector loaded from %vp: lanes whose %mask element is zero
    ; take the shuffled value, the remaining lanes are zero. The checks expect vptestnmw
    ; to set %k1 and a vpermw from (%rdi) with {%k1} {z}.
    238 define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
    239 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask0:
    240 ; GENERIC:       # %bb.0:
    241 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
    242 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
    243 ; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
    244 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    245 ;
    246 ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask0:
    247 ; SKX:       # %bb.0:
    248 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
    249 ; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
    250 ; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
    251 ; SKX-NEXT:    retq # sched: [7:1.00]
    252   %vec = load <16 x i16>, <16 x i16>* %vp
    253   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
    254   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
    255   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
    256   ret <16 x i16> %res
    257 }
    258 
    ; Merge-masked permute of a vector loaded from %vp (second index pattern): lanes whose
    ; %mask element is zero take the shuffled value, the remaining lanes take %vec2.
    ; The checks expect vptestnmw to set %k1 and a vpermw from (%rdi) into ymm0 under {%k1}.
    259 define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
    260 ; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask1:
    261 ; GENERIC:       # %bb.0:
    262 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
    263 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
    264 ; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
    265 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    266 ;
    267 ; SKX-LABEL: test_masked_16xi16_perm_mem_mask1:
    268 ; SKX:       # %bb.0:
    269 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
    270 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
    271 ; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
    272 ; SKX-NEXT:    retq # sched: [7:1.00]
    273   %vec = load <16 x i16>, <16 x i16>* %vp
    274   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
    275   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
    276   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
    277   ret <16 x i16> %res
    278 }
    279 
    ; Zero-masked permute of a vector loaded from %vp (second index pattern): lanes whose
    ; %mask element is zero take the shuffled value, the remaining lanes are zero.
    ; The checks expect vptestnmw to set %k1 and a vpermw from (%rdi) with {%k1} {z}.
    280 define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
    281 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask1:
    282 ; GENERIC:       # %bb.0:
    283 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
    284 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
    285 ; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
    286 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    287 ;
    288 ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask1:
    289 ; SKX:       # %bb.0:
    290 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
    291 ; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
    292 ; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
    293 ; SKX-NEXT:    retq # sched: [7:1.00]
    294   %vec = load <16 x i16>, <16 x i16>* %vp
    295   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
    296   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
    297   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
    298   ret <16 x i16> %res
    299 }
    300 
    ; Merge-masked permute of a vector loaded from %vp (third index pattern): lanes whose
    ; %mask element is zero take the shuffled value, the remaining lanes take %vec2.
    ; The checks expect vptestnmw to set %k1 and a vpermw from (%rdi) into ymm0 under {%k1}.
    301 define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
    302 ; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask2:
    303 ; GENERIC:       # %bb.0:
    304 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
    305 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
    306 ; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
    307 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    308 ;
    309 ; SKX-LABEL: test_masked_16xi16_perm_mem_mask2:
    310 ; SKX:       # %bb.0:
    311 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
    312 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
    313 ; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
    314 ; SKX-NEXT:    retq # sched: [7:1.00]
    315   %vec = load <16 x i16>, <16 x i16>* %vp
    316   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
    317   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
    318   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
    319   ret <16 x i16> %res
    320 }
    321 
    ; Zero-masked permute of a vector loaded from %vp (third index pattern): lanes whose
    ; %mask element is zero take the shuffled value, the remaining lanes are zero.
    ; The checks expect vptestnmw to set %k1 and a vpermw from (%rdi) with {%k1} {z}.
    322 define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
    323 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask2:
    324 ; GENERIC:       # %bb.0:
    325 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
    326 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
    327 ; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
    328 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    329 ;
    330 ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask2:
    331 ; SKX:       # %bb.0:
    332 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
    333 ; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
    334 ; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
    335 ; SKX-NEXT:    retq # sched: [7:1.00]
    336   %vec = load <16 x i16>, <16 x i16>* %vp
    337   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
    338   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
    339   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
    340   ret <16 x i16> %res
    341 }
    342 
    ; Unmasked <16 x i16> permute of a vector loaded from %vp using a constant index vector.
    ; The checks expect the vpermw to take (%rdi) as its memory operand.
    343 define <16 x i16> @test_16xi16_perm_mem_mask3(<16 x i16>* %vp) {
    344 ; GENERIC-LABEL: test_16xi16_perm_mem_mask3:
    345 ; GENERIC:       # %bb.0:
    346 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
    347 ; GENERIC-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    348 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    349 ;
    350 ; SKX-LABEL: test_16xi16_perm_mem_mask3:
    351 ; SKX:       # %bb.0:
    352 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
    353 ; SKX-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
    354 ; SKX-NEXT:    retq # sched: [7:1.00]
    355   %vec = load <16 x i16>, <16 x i16>* %vp
    356   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
    357   ret <16 x i16> %res
    358 }
    ; Merge-masked permute of a vector loaded from %vp (fourth index pattern): lanes whose
    ; %mask element is zero take the shuffled value, the remaining lanes take %vec2.
    ; The checks expect vptestnmw to set %k1 and a vpermw from (%rdi) into ymm0 under {%k1}.
    359 define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
    360 ; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask3:
    361 ; GENERIC:       # %bb.0:
    362 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
    363 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
    364 ; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
    365 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    366 ;
    367 ; SKX-LABEL: test_masked_16xi16_perm_mem_mask3:
    368 ; SKX:       # %bb.0:
    369 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
    370 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
    371 ; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
    372 ; SKX-NEXT:    retq # sched: [7:1.00]
    373   %vec = load <16 x i16>, <16 x i16>* %vp
    374   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
    375   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
    376   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
    377   ret <16 x i16> %res
    378 }
    379 
    ; Zero-masked permute of a vector loaded from %vp (fourth index pattern): lanes whose
    ; %mask element is zero take the shuffled value, the remaining lanes are zero.
    ; The checks expect vptestnmw to set %k1 and a vpermw from (%rdi) with {%k1} {z}.
    380 define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
    381 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask3:
    382 ; GENERIC:       # %bb.0:
    383 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
    384 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
    385 ; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
    386 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    387 ;
    388 ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask3:
    389 ; SKX:       # %bb.0:
    390 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
    391 ; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
    392 ; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
    393 ; SKX-NEXT:    retq # sched: [7:1.00]
    394   %vec = load <16 x i16>, <16 x i16>* %vp
    395   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
    396   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
    397   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
    398   ret <16 x i16> %res
    399 }
    400 
    ; Unmasked <32 x i16> permute of a register operand using a constant index vector.
    ; Both the GENERIC and SKX checks expect a vmovdqa64 of the indices into zmm1
    ; followed by one vpermw on zmm registers.
    401 define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) {
    402 ; GENERIC-LABEL: test_32xi16_perm_mask0:
    403 ; GENERIC:       # %bb.0:
    404 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50]
    405 ; GENERIC-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
    406 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    407 ;
    408 ; SKX-LABEL: test_32xi16_perm_mask0:
    409 ; SKX:       # %bb.0:
    410 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
    411 ; SKX-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
    412 ; SKX-NEXT:    retq # sched: [7:1.00]
    413   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
    414   ret <32 x i16> %res
    415 }
    ; Merge-masked <32 x i16> form: lanes whose %mask element is zero take the shuffled
    ; value, the remaining lanes take %vec2. The checks expect vptestnmw to set %k1,
    ; a vpermw into zmm1 under {%k1}, and a vmovdqa64 of zmm1 into zmm0.
    416 define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
    417 ; GENERIC-LABEL: test_masked_32xi16_perm_mask0:
    418 ; GENERIC:       # %bb.0:
    419 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50]
    420 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
    421 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
    422 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
    423 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    424 ;
    425 ; SKX-LABEL: test_masked_32xi16_perm_mask0:
    426 ; SKX:       # %bb.0:
    427 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
    428 ; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
    429 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
    430 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
    431 ; SKX-NEXT:    retq # sched: [7:1.00]
    432   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
    433   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
    434   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
    435   ret <32 x i16> %res
    436 }
    437 
    ; Zero-masked <32 x i16> form: lanes whose %mask element is zero take the shuffled
    ; value, the remaining lanes are zero. The checks expect vptestnmw to set %k1 and
    ; a zmm vpermw with {%k1} {z}.
    438 define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) {
    439 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask0:
    440 ; GENERIC:       # %bb.0:
    441 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50]
    442 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
    443 ; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
    444 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    445 ;
    446 ; SKX-LABEL: test_masked_z_32xi16_perm_mask0:
    447 ; SKX:       # %bb.0:
    448 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
    449 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
    450 ; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
    451 ; SKX-NEXT:    retq # sched: [7:1.00]
    452   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
    453   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
    454   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
    455   ret <32 x i16> %res
    456 }
    ; Merge-masked <32 x i16> form with a second index pattern: lanes whose %mask element
    ; is zero take the shuffled value, the remaining lanes take %vec2. The checks expect
    ; vptestnmw to set %k1, a vpermw into zmm1 under {%k1}, and a vmovdqa64 of zmm1 into zmm0.
    457 define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
    458 ; GENERIC-LABEL: test_masked_32xi16_perm_mask1:
    459 ; GENERIC:       # %bb.0:
    460 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [7:0.50]
    461 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
    462 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
    463 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
    464 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    465 ;
    466 ; SKX-LABEL: test_masked_32xi16_perm_mask1:
    467 ; SKX:       # %bb.0:
    468 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
    469 ; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
    470 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
    471 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
    472 ; SKX-NEXT:    retq # sched: [7:1.00]
    473   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
    474   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
    475   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
    476   ret <32 x i16> %res
    477 }
    478 
    ; Zero-masked <32 x i16> form with a second index pattern: lanes whose %mask element
    ; is zero take the shuffled value, the remaining lanes are zero. The checks expect
    ; vptestnmw to set %k1 and a zmm vpermw with {%k1} {z}.
    479 define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) {
    480 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask1:
    481 ; GENERIC:       # %bb.0:
    482 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [7:0.50]
    483 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
    484 ; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
    485 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    486 ;
    487 ; SKX-LABEL: test_masked_z_32xi16_perm_mask1:
    488 ; SKX:       # %bb.0:
    489 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
    490 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
    491 ; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
    492 ; SKX-NEXT:    retq # sched: [7:1.00]
    493   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
    494   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
    495   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
    496   ret <32 x i16> %res
    497 }
    498 define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; Merge-masked permute: lanes whose %mask element is zero take the shuffled %vec value, all other lanes keep %vec2.
    499 ; GENERIC-LABEL: test_masked_32xi16_perm_mask2:
    500 ; GENERIC:       # %bb.0:
    501 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [7:0.50]
    502 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
    503 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
    504 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
    505 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    506 ;
    507 ; SKX-LABEL: test_masked_32xi16_perm_mask2:
    508 ; SKX:       # %bb.0:
    509 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
    510 ; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
    511 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
    512 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
    513 ; SKX-NEXT:    retq # sched: [7:1.00]
    514   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27> ; permute %vec with constant lane indices (second operand is undef)
    515   %cmp = icmp eq <32 x i16> %mask, zeroinitializer ; lane-wise test for %mask == 0
    516   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 ; %shuf where %cmp is true, %vec2 elsewhere
    517   ret <32 x i16> %res
    518 }
    519 
    520 define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) { ; Zero-masked permute: lanes whose %mask element is zero take the shuffled %vec value, all other lanes are zeroed.
    521 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask2:
    522 ; GENERIC:       # %bb.0:
    523 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [7:0.50]
    524 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
    525 ; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
    526 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    527 ;
    528 ; SKX-LABEL: test_masked_z_32xi16_perm_mask2:
    529 ; SKX:       # %bb.0:
    530 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
    531 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
    532 ; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
    533 ; SKX-NEXT:    retq # sched: [7:1.00]
    534   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27> ; permute %vec with constant lane indices (second operand is undef)
    535   %cmp = icmp eq <32 x i16> %mask, zeroinitializer ; lane-wise test for %mask == 0
    536   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ; %shuf where %cmp is true, zero elsewhere
    537   ret <32 x i16> %res
    538 }
    539 define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) { ; Unmasked permute: returns %vec rearranged by the constant index list below.
    540 ; GENERIC-LABEL: test_32xi16_perm_mask3:
    541 ; GENERIC:       # %bb.0:
    542 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50]
    543 ; GENERIC-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
    544 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    545 ;
    546 ; SKX-LABEL: test_32xi16_perm_mask3:
    547 ; SKX:       # %bb.0:
    548 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
    549 ; SKX-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
    550 ; SKX-NEXT:    retq # sched: [7:1.00]
    551   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4> ; permute %vec with constant lane indices (second operand is undef)
    552   ret <32 x i16> %res
    553 }
    554 define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; Merge-masked permute: lanes whose %mask element is zero take the shuffled %vec value, all other lanes keep %vec2.
    555 ; GENERIC-LABEL: test_masked_32xi16_perm_mask3:
    556 ; GENERIC:       # %bb.0:
    557 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50]
    558 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
    559 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
    560 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
    561 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    562 ;
    563 ; SKX-LABEL: test_masked_32xi16_perm_mask3:
    564 ; SKX:       # %bb.0:
    565 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
    566 ; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
    567 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
    568 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
    569 ; SKX-NEXT:    retq # sched: [7:1.00]
    570   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4> ; permute %vec with constant lane indices (second operand is undef)
    571   %cmp = icmp eq <32 x i16> %mask, zeroinitializer ; lane-wise test for %mask == 0
    572   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 ; %shuf where %cmp is true, %vec2 elsewhere
    573   ret <32 x i16> %res
    574 }
    575 
    576 define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) { ; Zero-masked permute: lanes whose %mask element is zero take the shuffled %vec value, all other lanes are zeroed.
    577 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask3:
    578 ; GENERIC:       # %bb.0:
    579 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50]
    580 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
    581 ; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
    582 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    583 ;
    584 ; SKX-LABEL: test_masked_z_32xi16_perm_mask3:
    585 ; SKX:       # %bb.0:
    586 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
    587 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
    588 ; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
    589 ; SKX-NEXT:    retq # sched: [7:1.00]
    590   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4> ; permute %vec with constant lane indices (second operand is undef)
    591   %cmp = icmp eq <32 x i16> %mask, zeroinitializer ; lane-wise test for %mask == 0
    592   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ; %shuf where %cmp is true, zero elsewhere
    593   ret <32 x i16> %res
    594 }
    595 define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) { ; Unmasked permute of the vector loaded from %vp; the expected assembly uses (%rdi) directly as the vpermw source.
    596 ; GENERIC-LABEL: test_32xi16_perm_mem_mask0:
    597 ; GENERIC:       # %bb.0:
    598 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50]
    599 ; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
    600 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    601 ;
    602 ; SKX-LABEL: test_32xi16_perm_mem_mask0:
    603 ; SKX:       # %bb.0:
    604 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
    605 ; SKX-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
    606 ; SKX-NEXT:    retq # sched: [7:1.00]
    607   %vec = load <32 x i16>, <32 x i16>* %vp ; read the source vector through %vp
    608   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12> ; permute %vec with constant lane indices (second operand is undef)
    609   ret <32 x i16> %res
    610 }
    611 define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; Merge-masked permute of the vector loaded from %vp: lanes whose %mask element is zero take the shuffled value, all other lanes keep %vec2.
    612 ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask0:
    613 ; GENERIC:       # %bb.0:
    614 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50]
    615 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
    616 ; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
    617 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    618 ;
    619 ; SKX-LABEL: test_masked_32xi16_perm_mem_mask0:
    620 ; SKX:       # %bb.0:
    621 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
    622 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
    623 ; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
    624 ; SKX-NEXT:    retq # sched: [7:1.00]
    625   %vec = load <32 x i16>, <32 x i16>* %vp ; read the source vector through %vp
    626   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12> ; permute %vec with constant lane indices (second operand is undef)
    627   %cmp = icmp eq <32 x i16> %mask, zeroinitializer ; lane-wise test for %mask == 0
    628   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 ; %shuf where %cmp is true, %vec2 elsewhere
    629   ret <32 x i16> %res
    630 }
    631 
    632 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) { ; Zero-masked permute of the vector loaded from %vp: lanes whose %mask element is zero take the shuffled value, all other lanes are zeroed.
    633 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask0:
    634 ; GENERIC:       # %bb.0:
    635 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50]
    636 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
    637 ; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
    638 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    639 ;
    640 ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask0:
    641 ; SKX:       # %bb.0:
    642 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
    643 ; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
    644 ; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
    645 ; SKX-NEXT:    retq # sched: [7:1.00]
    646   %vec = load <32 x i16>, <32 x i16>* %vp ; read the source vector through %vp
    647   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12> ; permute %vec with constant lane indices (second operand is undef)
    648   %cmp = icmp eq <32 x i16> %mask, zeroinitializer ; lane-wise test for %mask == 0
    649   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ; %shuf where %cmp is true, zero elsewhere
    650   ret <32 x i16> %res
    651 }
    652 
    653 define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; Merge-masked permute of the vector loaded from %vp: lanes whose %mask element is zero take the shuffled value, all other lanes keep %vec2.
    654 ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask1:
    655 ; GENERIC:       # %bb.0:
    656 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [7:0.50]
    657 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
    658 ; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
    659 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    660 ;
    661 ; SKX-LABEL: test_masked_32xi16_perm_mem_mask1:
    662 ; SKX:       # %bb.0:
    663 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
    664 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
    665 ; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
    666 ; SKX-NEXT:    retq # sched: [7:1.00]
    667   %vec = load <32 x i16>, <32 x i16>* %vp ; read the source vector through %vp
    668   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6> ; permute %vec with constant lane indices (second operand is undef)
    669   %cmp = icmp eq <32 x i16> %mask, zeroinitializer ; lane-wise test for %mask == 0
    670   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 ; %shuf where %cmp is true, %vec2 elsewhere
    671   ret <32 x i16> %res
    672 }
    673 
    674 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) { ; Zero-masked permute of the vector loaded from %vp: lanes whose %mask element is zero take the shuffled value, all other lanes are zeroed.
    675 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask1:
    676 ; GENERIC:       # %bb.0:
    677 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [7:0.50]
    678 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
    679 ; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
    680 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    681 ;
    682 ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask1:
    683 ; SKX:       # %bb.0:
    684 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
    685 ; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
    686 ; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
    687 ; SKX-NEXT:    retq # sched: [7:1.00]
    688   %vec = load <32 x i16>, <32 x i16>* %vp ; read the source vector through %vp
    689   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6> ; permute %vec with constant lane indices (second operand is undef)
    690   %cmp = icmp eq <32 x i16> %mask, zeroinitializer ; lane-wise test for %mask == 0
    691   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ; %shuf where %cmp is true, zero elsewhere
    692   ret <32 x i16> %res
    693 }
    694 
    695 define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; Merge-masked permute of the vector loaded from %vp: lanes whose %mask element is zero take the shuffled value, all other lanes keep %vec2.
    696 ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask2:
    697 ; GENERIC:       # %bb.0:
    698 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [7:0.50]
    699 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
    700 ; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
    701 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    702 ;
    703 ; SKX-LABEL: test_masked_32xi16_perm_mem_mask2:
    704 ; SKX:       # %bb.0:
    705 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
    706 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
    707 ; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
    708 ; SKX-NEXT:    retq # sched: [7:1.00]
    709   %vec = load <32 x i16>, <32 x i16>* %vp ; read the source vector through %vp
    710   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25> ; permute %vec with constant lane indices (second operand is undef)
    711   %cmp = icmp eq <32 x i16> %mask, zeroinitializer ; lane-wise test for %mask == 0
    712   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 ; %shuf where %cmp is true, %vec2 elsewhere
    713   ret <32 x i16> %res
    714 }
    715 
    716 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) { ; Zero-masked permute of the vector loaded from %vp: lanes whose %mask element is zero take the shuffled value, all other lanes are zeroed.
    717 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask2:
    718 ; GENERIC:       # %bb.0:
    719 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [7:0.50]
    720 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
    721 ; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
    722 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    723 ;
    724 ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask2:
    725 ; SKX:       # %bb.0:
    726 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
    727 ; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
    728 ; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
    729 ; SKX-NEXT:    retq # sched: [7:1.00]
    730   %vec = load <32 x i16>, <32 x i16>* %vp ; read the source vector through %vp
    731   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25> ; permute %vec with constant lane indices (second operand is undef)
    732   %cmp = icmp eq <32 x i16> %mask, zeroinitializer ; lane-wise test for %mask == 0
    733   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ; %shuf where %cmp is true, zero elsewhere
    734   ret <32 x i16> %res
    735 }
    736 
    737 define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) { ; Unmasked permute of the vector loaded from %vp; the expected assembly uses (%rdi) directly as the vpermw source.
    738 ; GENERIC-LABEL: test_32xi16_perm_mem_mask3:
    739 ; GENERIC:       # %bb.0:
    740 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50]
    741 ; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
    742 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    743 ;
    744 ; SKX-LABEL: test_32xi16_perm_mem_mask3:
    745 ; SKX:       # %bb.0:
    746 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
    747 ; SKX-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
    748 ; SKX-NEXT:    retq # sched: [7:1.00]
    749   %vec = load <32 x i16>, <32 x i16>* %vp ; read the source vector through %vp
    750   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27> ; permute %vec with constant lane indices (second operand is undef)
    751   ret <32 x i16> %res
    752 }
    753 define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; Merge-masked permute of the vector loaded from %vp: lanes whose %mask element is zero take the shuffled value, all other lanes keep %vec2.
    754 ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask3:
    755 ; GENERIC:       # %bb.0:
    756 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50]
    757 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
    758 ; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
    759 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    760 ;
    761 ; SKX-LABEL: test_masked_32xi16_perm_mem_mask3:
    762 ; SKX:       # %bb.0:
    763 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
    764 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
    765 ; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
    766 ; SKX-NEXT:    retq # sched: [7:1.00]
    767   %vec = load <32 x i16>, <32 x i16>* %vp ; read the source vector through %vp
    768   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27> ; permute %vec with constant lane indices (second operand is undef)
    769   %cmp = icmp eq <32 x i16> %mask, zeroinitializer ; lane-wise test for %mask == 0
    770   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 ; %shuf where %cmp is true, %vec2 elsewhere
    771   ret <32 x i16> %res
    772 }
    773 
    774 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) { ; Zero-masked permute of the vector loaded from %vp: lanes whose %mask element is zero take the shuffled value, all other lanes are zeroed.
    775 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask3:
    776 ; GENERIC:       # %bb.0:
    777 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50]
    778 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
    779 ; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
    780 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    781 ;
    782 ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask3:
    783 ; SKX:       # %bb.0:
    784 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
    785 ; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
    786 ; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
    787 ; SKX-NEXT:    retq # sched: [7:1.00]
    788   %vec = load <32 x i16>, <32 x i16>* %vp ; read the source vector through %vp
    789   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27> ; permute %vec with constant lane indices (second operand is undef)
    790   %cmp = icmp eq <32 x i16> %mask, zeroinitializer ; lane-wise test for %mask == 0
    791   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ; %shuf where %cmp is true, zero elsewhere
    792   ret <32 x i16> %res
    793 }
    794 
    795 define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) { ; Unmasked permute of %vec; the expected assembly for this i32 shuffle is vpermps.
    796 ; GENERIC-LABEL: test_8xi32_perm_mask0:
    797 ; GENERIC:       # %bb.0:
    798 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
    799 ; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
    800 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    801 ;
    802 ; SKX-LABEL: test_8xi32_perm_mask0:
    803 ; SKX:       # %bb.0:
    804 ; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
    805 ; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
    806 ; SKX-NEXT:    retq # sched: [7:1.00]
    807   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6> ; permute %vec with constant lane indices (second operand is undef)
    808   ret <8 x i32> %res
    809 }
    810 define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked permute: lanes whose %mask element is zero take the shuffled %vec value, all other lanes keep %vec2.
    811 ; GENERIC-LABEL: test_masked_8xi32_perm_mask0:
    812 ; GENERIC:       # %bb.0:
    813 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
    814 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
    815 ; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
    816 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
    817 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    818 ;
    819 ; SKX-LABEL: test_masked_8xi32_perm_mask0:
    820 ; SKX:       # %bb.0:
    821 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
    822 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
    823 ; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
    824 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
    825 ; SKX-NEXT:    retq # sched: [7:1.00]
    826   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6> ; permute %vec with constant lane indices (second operand is undef)
    827   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane-wise test for %mask == 0
    828   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ; %shuf where %cmp is true, %vec2 elsewhere
    829   ret <8 x i32> %res
    830 }
    831 
    832 define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) { ; Zero-masked permute: lanes whose %mask element is zero take the shuffled %vec value, all other lanes are zeroed.
    833 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mask0:
    834 ; GENERIC:       # %bb.0:
    835 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
    836 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
    837 ; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
    838 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    839 ;
    840 ; SKX-LABEL: test_masked_z_8xi32_perm_mask0:
    841 ; SKX:       # %bb.0:
    842 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
    843 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
    844 ; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
    845 ; SKX-NEXT:    retq # sched: [7:1.00]
    846   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6> ; permute %vec with constant lane indices (second operand is undef)
    847   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane-wise test for %mask == 0
    848   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ; %shuf where %cmp is true, zero elsewhere
    849   ret <8 x i32> %res
    850 }
    851 define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked permute: lanes whose %mask element is zero take the shuffled %vec value, all other lanes keep %vec2.
    852 ; GENERIC-LABEL: test_masked_8xi32_perm_mask1:
    853 ; GENERIC:       # %bb.0:
    854 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
    855 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
    856 ; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
    857 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
    858 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    859 ;
    860 ; SKX-LABEL: test_masked_8xi32_perm_mask1:
    861 ; SKX:       # %bb.0:
    862 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
    863 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
    864 ; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
    865 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
    866 ; SKX-NEXT:    retq # sched: [7:1.00]
    867   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3> ; permute %vec with constant lane indices (second operand is undef)
    868   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane-wise test for %mask == 0
    869   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ; %shuf where %cmp is true, %vec2 elsewhere
    870   ret <8 x i32> %res
    871 }
    872 
    873 define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) { ; Zero-masked permute: lanes whose %mask element is zero take the shuffled %vec value, all other lanes are zeroed.
    874 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mask1:
    875 ; GENERIC:       # %bb.0:
    876 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
    877 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
    878 ; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
    879 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    880 ;
    881 ; SKX-LABEL: test_masked_z_8xi32_perm_mask1:
    882 ; SKX:       # %bb.0:
    883 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
    884 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
    885 ; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
    886 ; SKX-NEXT:    retq # sched: [7:1.00]
    887   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3> ; permute %vec with constant lane indices (second operand is undef)
    888   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane-wise test for %mask == 0
    889   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ; %shuf where %cmp is true, zero elsewhere
    890   ret <8 x i32> %res
    891 }
    892 define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked permute: lanes whose %mask element is zero take the shuffled %vec value, all other lanes keep %vec2.
    893 ; GENERIC-LABEL: test_masked_8xi32_perm_mask2:
    894 ; GENERIC:       # %bb.0:
    895 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
    896 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
    897 ; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
    898 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
    899 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    900 ;
    901 ; SKX-LABEL: test_masked_8xi32_perm_mask2:
    902 ; SKX:       # %bb.0:
    903 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
    904 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
    905 ; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
    906 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
    907 ; SKX-NEXT:    retq # sched: [7:1.00]
    908   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4> ; permute %vec with constant lane indices (second operand is undef)
    909   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane-wise test for %mask == 0
    910   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ; %shuf where %cmp is true, %vec2 elsewhere
    911   ret <8 x i32> %res
    912 }
    913 
    914 define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) { ; Zero-masked permute: lanes whose %mask element is zero take the shuffled %vec value, all other lanes are zeroed.
    915 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mask2:
    916 ; GENERIC:       # %bb.0:
    917 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
    918 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
    919 ; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
    920 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    921 ;
    922 ; SKX-LABEL: test_masked_z_8xi32_perm_mask2:
    923 ; SKX:       # %bb.0:
    924 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
    925 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
    926 ; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
    927 ; SKX-NEXT:    retq # sched: [7:1.00]
    928   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4> ; permute %vec with constant lane indices (second operand is undef)
    929   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane-wise test for %mask == 0
    930   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ; %shuf where %cmp is true, zero elsewhere
    931   ret <8 x i32> %res
    932 }
    933 define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) { ; Unmasked permute of %vec; the expected assembly for this i32 shuffle is vpermps.
    934 ; GENERIC-LABEL: test_8xi32_perm_mask3:
    935 ; GENERIC:       # %bb.0:
    936 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
    937 ; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
    938 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    939 ;
    940 ; SKX-LABEL: test_8xi32_perm_mask3:
    941 ; SKX:       # %bb.0:
    942 ; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
    943 ; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
    944 ; SKX-NEXT:    retq # sched: [7:1.00]
    945   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0> ; permute %vec with constant lane indices (second operand is undef)
    946   ret <8 x i32> %res
    947 }
    948 define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
        ; Merge-masked permute of a register operand: lanes where %mask == 0 take
        ; the shuffled %vec element, every other lane keeps %vec2 (vpermd {%k1}).
    949 ; GENERIC-LABEL: test_masked_8xi32_perm_mask3:
    950 ; GENERIC:       # %bb.0:
    951 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
    952 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
    953 ; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
    954 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
    955 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    956 ;
    957 ; SKX-LABEL: test_masked_8xi32_perm_mask3:
    958 ; SKX:       # %bb.0:
    959 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
    960 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
    961 ; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
    962 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
    963 ; SKX-NEXT:    retq # sched: [7:1.00]
    964   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
    965   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
    966   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
    967   ret <8 x i32> %res
    968 }
    969 
    970 define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
        ; Zero-masked permute of a register operand: lanes where %mask == 0 take
        ; the shuffled %vec element, every other lane is zero (vpermd {%k1} {z}).
    971 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mask3:
    972 ; GENERIC:       # %bb.0:
    973 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
    974 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
    975 ; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
    976 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    977 ;
    978 ; SKX-LABEL: test_masked_z_8xi32_perm_mask3:
    979 ; SKX:       # %bb.0:
    980 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
    981 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
    982 ; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
    983 ; SKX-NEXT:    retq # sched: [7:1.00]
    984   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
    985   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
    986   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
    987   ret <8 x i32> %res
    988 }
    989 define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
        ; Unmasked permute of a vector loaded from %vp; both prefixes expect the
        ; load folded into vpermps as a (%rdi) memory operand.
    990 ; GENERIC-LABEL: test_8xi32_perm_mem_mask0:
    991 ; GENERIC:       # %bb.0:
    992 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
    993 ; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    994 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    995 ;
    996 ; SKX-LABEL: test_8xi32_perm_mem_mask0:
    997 ; SKX:       # %bb.0:
    998 ; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
    999 ; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   1000 ; SKX-NEXT:    retq # sched: [7:1.00]
   1001   %vec = load <8 x i32>, <8 x i32>* %vp
   1002   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
   1003   ret <8 x i32> %res
   1004 }
   1005 define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
        ; Merge-masked permute of a vector loaded from %vp: lanes where %mask == 0
        ; take the shuffled element, the rest keep %vec2; the load is expected to
        ; fold into vpermd as a (%rdi) operand.
   1006 ; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask0:
   1007 ; GENERIC:       # %bb.0:
   1008 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
   1009 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   1010 ; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
   1011 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1012 ;
   1013 ; SKX-LABEL: test_masked_8xi32_perm_mem_mask0:
   1014 ; SKX:       # %bb.0:
   1015 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
   1016 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   1017 ; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
   1018 ; SKX-NEXT:    retq # sched: [7:1.00]
   1019   %vec = load <8 x i32>, <8 x i32>* %vp
   1020   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
   1021   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1022   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   1023   ret <8 x i32> %res
   1024 }
   1025 
   1026 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
        ; Zero-masked permute of a vector loaded from %vp: lanes where %mask == 0
        ; take the shuffled element, the rest are zero; the load is expected to
        ; fold into vpermd as a (%rdi) operand.
   1027 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask0:
   1028 ; GENERIC:       # %bb.0:
   1029 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
   1030 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
   1031 ; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
   1032 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1033 ;
   1034 ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask0:
   1035 ; SKX:       # %bb.0:
   1036 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
   1037 ; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
   1038 ; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
   1039 ; SKX-NEXT:    retq # sched: [7:1.00]
   1040   %vec = load <8 x i32>, <8 x i32>* %vp
   1041   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
   1042   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1043   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   1044   ret <8 x i32> %res
   1045 }
   1046 
   1047 define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
        ; Merge-masked permute of a vector loaded from %vp: lanes where %mask == 0
        ; take the shuffled element, the rest keep %vec2; the load is expected to
        ; fold into vpermd as a (%rdi) operand.
   1048 ; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask1:
   1049 ; GENERIC:       # %bb.0:
   1050 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
   1051 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   1052 ; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
   1053 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1054 ;
   1055 ; SKX-LABEL: test_masked_8xi32_perm_mem_mask1:
   1056 ; SKX:       # %bb.0:
   1057 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
   1058 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   1059 ; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
   1060 ; SKX-NEXT:    retq # sched: [7:1.00]
   1061   %vec = load <8 x i32>, <8 x i32>* %vp
   1062   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
   1063   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1064   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   1065   ret <8 x i32> %res
   1066 }
   1067 
   1068 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
        ; Zero-masked permute of a vector loaded from %vp: lanes where %mask == 0
        ; take the shuffled element, the rest are zero; the load is expected to
        ; fold into vpermd as a (%rdi) operand.
   1069 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask1:
   1070 ; GENERIC:       # %bb.0:
   1071 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
   1072 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
   1073 ; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
   1074 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1075 ;
   1076 ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask1:
   1077 ; SKX:       # %bb.0:
   1078 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
   1079 ; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
   1080 ; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
   1081 ; SKX-NEXT:    retq # sched: [7:1.00]
   1082   %vec = load <8 x i32>, <8 x i32>* %vp
   1083   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
   1084   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1085   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   1086   ret <8 x i32> %res
   1087 }
   1088 
   1089 define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
        ; Merge-masked permute of a vector loaded from %vp: lanes where %mask == 0
        ; take the shuffled element, the rest keep %vec2; the load is expected to
        ; fold into vpermd as a (%rdi) operand.
   1090 ; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask2:
   1091 ; GENERIC:       # %bb.0:
   1092 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
   1093 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   1094 ; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
   1095 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1096 ;
   1097 ; SKX-LABEL: test_masked_8xi32_perm_mem_mask2:
   1098 ; SKX:       # %bb.0:
   1099 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
   1100 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   1101 ; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
   1102 ; SKX-NEXT:    retq # sched: [7:1.00]
   1103   %vec = load <8 x i32>, <8 x i32>* %vp
   1104   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
   1105   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1106   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   1107   ret <8 x i32> %res
   1108 }
   1109 
   1110 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
        ; Zero-masked permute of a vector loaded from %vp: lanes where %mask == 0
        ; take the shuffled element, the rest are zero; the load is expected to
        ; fold into vpermd as a (%rdi) operand.
   1111 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask2:
   1112 ; GENERIC:       # %bb.0:
   1113 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
   1114 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
   1115 ; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
   1116 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1117 ;
   1118 ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask2:
   1119 ; SKX:       # %bb.0:
   1120 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
   1121 ; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
   1122 ; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
   1123 ; SKX-NEXT:    retq # sched: [7:1.00]
   1124   %vec = load <8 x i32>, <8 x i32>* %vp
   1125   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
   1126   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1127   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   1128   ret <8 x i32> %res
   1129 }
   1130 
   1131 define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
        ; Unmasked permute of a vector loaded from %vp; both prefixes expect the
        ; load folded into vpermps as a (%rdi) memory operand.
   1132 ; GENERIC-LABEL: test_8xi32_perm_mem_mask3:
   1133 ; GENERIC:       # %bb.0:
   1134 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
   1135 ; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   1136 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1137 ;
   1138 ; SKX-LABEL: test_8xi32_perm_mem_mask3:
   1139 ; SKX:       # %bb.0:
   1140 ; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
   1141 ; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   1142 ; SKX-NEXT:    retq # sched: [7:1.00]
   1143   %vec = load <8 x i32>, <8 x i32>* %vp
   1144   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
   1145   ret <8 x i32> %res
   1146 }
   1147 define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
        ; Merge-masked permute of a vector loaded from %vp: lanes where %mask == 0
        ; take the shuffled element, the rest keep %vec2; the load is expected to
        ; fold into vpermd as a (%rdi) operand.
   1148 ; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask3:
   1149 ; GENERIC:       # %bb.0:
   1150 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
   1151 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   1152 ; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
   1153 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1154 ;
   1155 ; SKX-LABEL: test_masked_8xi32_perm_mem_mask3:
   1156 ; SKX:       # %bb.0:
   1157 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
   1158 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   1159 ; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
   1160 ; SKX-NEXT:    retq # sched: [7:1.00]
   1161   %vec = load <8 x i32>, <8 x i32>* %vp
   1162   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
   1163   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1164   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   1165   ret <8 x i32> %res
   1166 }
   1167 
   1168 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
        ; Zero-masked permute of a vector loaded from %vp: lanes where %mask == 0
        ; take the shuffled element, the rest are zero; the load is expected to
        ; fold into vpermd as a (%rdi) operand.
   1169 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask3:
   1170 ; GENERIC:       # %bb.0:
   1171 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
   1172 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
   1173 ; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
   1174 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1175 ;
   1176 ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask3:
   1177 ; SKX:       # %bb.0:
   1178 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
   1179 ; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
   1180 ; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
   1181 ; SKX-NEXT:    retq # sched: [7:1.00]
   1182   %vec = load <8 x i32>, <8 x i32>* %vp
   1183   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
   1184   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   1185   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   1186   ret <8 x i32> %res
   1187 }
   1188 
   1189 define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
        ; Unmasked permute of a register operand with constant indices; both
        ; prefixes expect the index vector in a register feeding vpermps.
        ; NOTE(review): %mask is declared but never used in this function; the
        ; other unmasked register tests (e.g. test_16xi32_perm_mask3) take only
        ; %vec. Confirm whether the extra parameter is intentional.
   1190 ; GENERIC-LABEL: test_16xi32_perm_mask0:
   1191 ; GENERIC:       # %bb.0:
   1192 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50]
   1193 ; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
   1194 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1195 ;
   1196 ; SKX-LABEL: test_16xi32_perm_mask0:
   1197 ; SKX:       # %bb.0:
   1198 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
   1199 ; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
   1200 ; SKX-NEXT:    retq # sched: [7:1.00]
   1201   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
   1202   ret <16 x i32> %res
   1203 }
   1204 define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
        ; Merge-masked permute of a register operand: lanes where %mask == 0 take
        ; the shuffled %vec element, every other lane keeps %vec2 (vpermd {%k1}).
   1205 ; GENERIC-LABEL: test_masked_16xi32_perm_mask0:
   1206 ; GENERIC:       # %bb.0:
   1207 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50]
   1208 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   1209 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   1210 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   1211 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1212 ;
   1213 ; SKX-LABEL: test_masked_16xi32_perm_mask0:
   1214 ; SKX:       # %bb.0:
   1215 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
   1216 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   1217 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   1218 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   1219 ; SKX-NEXT:    retq # sched: [7:1.00]
   1220   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
   1221   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1222   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   1223   ret <16 x i32> %res
   1224 }
   1225 
   1226 define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
        ; Zero-masked permute of a register operand: lanes where %mask == 0 take
        ; the shuffled %vec element, every other lane is zero (vpermd {%k1} {z}).
   1227 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask0:
   1228 ; GENERIC:       # %bb.0:
   1229 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50]
   1230 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   1231 ; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   1232 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1233 ;
   1234 ; SKX-LABEL: test_masked_z_16xi32_perm_mask0:
   1235 ; SKX:       # %bb.0:
   1236 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
   1237 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   1238 ; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   1239 ; SKX-NEXT:    retq # sched: [7:1.00]
   1240   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
   1241   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1242   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1243   ret <16 x i32> %res
   1244 }
   1245 define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
        ; Merge-masked permute of a register operand: lanes where %mask == 0 take
        ; the shuffled %vec element, every other lane keeps %vec2 (vpermd {%k1}).
   1246 ; GENERIC-LABEL: test_masked_16xi32_perm_mask1:
   1247 ; GENERIC:       # %bb.0:
   1248 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [7:0.50]
   1249 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   1250 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   1251 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   1252 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1253 ;
   1254 ; SKX-LABEL: test_masked_16xi32_perm_mask1:
   1255 ; SKX:       # %bb.0:
   1256 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
   1257 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   1258 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   1259 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   1260 ; SKX-NEXT:    retq # sched: [7:1.00]
   1261   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
   1262   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1263   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   1264   ret <16 x i32> %res
   1265 }
   1266 
   1267 define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
        ; Zero-masked permute of a register operand: lanes where %mask == 0 take
        ; the shuffled %vec element, every other lane is zero (vpermd {%k1} {z}).
   1268 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask1:
   1269 ; GENERIC:       # %bb.0:
   1270 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [7:0.50]
   1271 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   1272 ; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   1273 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1274 ;
   1275 ; SKX-LABEL: test_masked_z_16xi32_perm_mask1:
   1276 ; SKX:       # %bb.0:
   1277 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
   1278 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   1279 ; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   1280 ; SKX-NEXT:    retq # sched: [7:1.00]
   1281   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
   1282   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1283   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1284   ret <16 x i32> %res
   1285 }
   1286 define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
        ; Merge-masked permute of a register operand: lanes where %mask == 0 take
        ; the shuffled %vec element, every other lane keeps %vec2 (vpermd {%k1}).
   1287 ; GENERIC-LABEL: test_masked_16xi32_perm_mask2:
   1288 ; GENERIC:       # %bb.0:
   1289 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [7:0.50]
   1290 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   1291 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   1292 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   1293 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1294 ;
   1295 ; SKX-LABEL: test_masked_16xi32_perm_mask2:
   1296 ; SKX:       # %bb.0:
   1297 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
   1298 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   1299 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   1300 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   1301 ; SKX-NEXT:    retq # sched: [7:1.00]
   1302   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
   1303   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1304   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   1305   ret <16 x i32> %res
   1306 }
   1307 
   1308 define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
        ; Zero-masked permute of a register operand: lanes where %mask == 0 take
        ; the shuffled %vec element, every other lane is zero (vpermd {%k1} {z}).
   1309 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask2:
   1310 ; GENERIC:       # %bb.0:
   1311 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [7:0.50]
   1312 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   1313 ; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   1314 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1315 ;
   1316 ; SKX-LABEL: test_masked_z_16xi32_perm_mask2:
   1317 ; SKX:       # %bb.0:
   1318 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
   1319 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   1320 ; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   1321 ; SKX-NEXT:    retq # sched: [7:1.00]
   1322   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
   1323   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1324   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1325   ret <16 x i32> %res
   1326 }
   1327 define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
        ; Unmasked permute of a register operand with constant indices; both
        ; prefixes expect the index vector in a register feeding vpermps.
   1328 ; GENERIC-LABEL: test_16xi32_perm_mask3:
   1329 ; GENERIC:       # %bb.0:
   1330 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50]
   1331 ; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
   1332 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1333 ;
   1334 ; SKX-LABEL: test_16xi32_perm_mask3:
   1335 ; SKX:       # %bb.0:
   1336 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
   1337 ; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
   1338 ; SKX-NEXT:    retq # sched: [7:1.00]
   1339   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
   1340   ret <16 x i32> %res
   1341 }
   1342 define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
        ; Merge-masked permute of a register operand: lanes where %mask == 0 take
        ; the shuffled %vec element, every other lane keeps %vec2 (vpermd {%k1}).
   1343 ; GENERIC-LABEL: test_masked_16xi32_perm_mask3:
   1344 ; GENERIC:       # %bb.0:
   1345 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50]
   1346 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   1347 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   1348 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   1349 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1350 ;
   1351 ; SKX-LABEL: test_masked_16xi32_perm_mask3:
   1352 ; SKX:       # %bb.0:
   1353 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
   1354 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   1355 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   1356 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   1357 ; SKX-NEXT:    retq # sched: [7:1.00]
   1358   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
   1359   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1360   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   1361   ret <16 x i32> %res
   1362 }
   1363 
   1364 define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
        ; Zero-masked permute of a register operand: lanes where %mask == 0 take
        ; the shuffled %vec element, every other lane is zero (vpermd {%k1} {z}).
   1365 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask3:
   1366 ; GENERIC:       # %bb.0:
   1367 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50]
   1368 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   1369 ; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   1370 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1371 ;
   1372 ; SKX-LABEL: test_masked_z_16xi32_perm_mask3:
   1373 ; SKX:       # %bb.0:
   1374 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
   1375 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   1376 ; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   1377 ; SKX-NEXT:    retq # sched: [7:1.00]
   1378   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
   1379   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1380   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1381   ret <16 x i32> %res
   1382 }
   1383 define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
        ; Unmasked permute of a vector loaded from %vp; both prefixes expect the
        ; load folded into vpermps as a (%rdi) memory operand.
   1384 ; GENERIC-LABEL: test_16xi32_perm_mem_mask0:
   1385 ; GENERIC:       # %bb.0:
   1386 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50]
   1387 ; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
   1388 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1389 ;
   1390 ; SKX-LABEL: test_16xi32_perm_mem_mask0:
   1391 ; SKX:       # %bb.0:
   1392 ; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
   1393 ; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
   1394 ; SKX-NEXT:    retq # sched: [7:1.00]
   1395   %vec = load <16 x i32>, <16 x i32>* %vp
   1396   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
   1397   ret <16 x i32> %res
   1398 }
   1399 define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
        ; Merge-masked permute of a vector loaded from %vp: lanes where %mask == 0
        ; take the shuffled element, the rest keep %vec2; the load is expected to
        ; fold into vpermd as a (%rdi) operand.
   1400 ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask0:
   1401 ; GENERIC:       # %bb.0:
   1402 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50]
   1403 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   1404 ; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   1405 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1406 ;
   1407 ; SKX-LABEL: test_masked_16xi32_perm_mem_mask0:
   1408 ; SKX:       # %bb.0:
   1409 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
   1410 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   1411 ; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   1412 ; SKX-NEXT:    retq # sched: [7:1.00]
   1413   %vec = load <16 x i32>, <16 x i32>* %vp
   1414   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
   1415   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1416   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   1417   ret <16 x i32> %res
   1418 }
   1419 
   1420 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
        ; Zero-masked permute of a vector loaded from %vp: lanes where %mask == 0
        ; take the shuffled element, the rest are zero; the load is expected to
        ; fold into vpermd as a (%rdi) operand.
   1421 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask0:
   1422 ; GENERIC:       # %bb.0:
   1423 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50]
   1424 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
   1425 ; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   1426 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1427 ;
   1428 ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask0:
   1429 ; SKX:       # %bb.0:
   1430 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
   1431 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
   1432 ; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   1433 ; SKX-NEXT:    retq # sched: [7:1.00]
   1434   %vec = load <16 x i32>, <16 x i32>* %vp
   1435   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
   1436   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1437   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1438   ret <16 x i32> %res
   1439 }
   1440 
   1441 define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
        ; Merge-masked permute of a vector loaded from %vp: lanes where %mask == 0
        ; take the shuffled element, the rest keep %vec2; the load is expected to
        ; fold into vpermd as a (%rdi) operand.
   1442 ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask1:
   1443 ; GENERIC:       # %bb.0:
   1444 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [7:0.50]
   1445 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   1446 ; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   1447 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1448 ;
   1449 ; SKX-LABEL: test_masked_16xi32_perm_mem_mask1:
   1450 ; SKX:       # %bb.0:
   1451 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
   1452 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   1453 ; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   1454 ; SKX-NEXT:    retq # sched: [7:1.00]
   1455   %vec = load <16 x i32>, <16 x i32>* %vp
   1456   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
   1457   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1458   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   1459   ret <16 x i32> %res
   1460 }
   1461 
   1462 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
        ; Zero-masked permute of a vector loaded from %vp: lanes where %mask == 0
        ; take the shuffled element, the rest are zero; the load is expected to
        ; fold into vpermd as a (%rdi) operand.
   1463 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask1:
   1464 ; GENERIC:       # %bb.0:
   1465 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [7:0.50]
   1466 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
   1467 ; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   1468 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1469 ;
   1470 ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask1:
   1471 ; SKX:       # %bb.0:
   1472 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
   1473 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
   1474 ; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   1475 ; SKX-NEXT:    retq # sched: [7:1.00]
   1476   %vec = load <16 x i32>, <16 x i32>* %vp
   1477   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
   1478   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   1479   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   1480   ret <16 x i32> %res
   1481 }
   1482 
   1483 define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; Merge-masked permute of a loaded <16 x i32>: checks expect vptestnmd into %k1, then vpermd from (%rdi) under {%k1}.
   1484 ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask2:
   1485 ; GENERIC:       # %bb.0:
   1486 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [7:0.50]
   1487 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   1488 ; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   1489 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1490 ;
   1491 ; SKX-LABEL: test_masked_16xi32_perm_mem_mask2:
   1492 ; SKX:       # %bb.0:
   1493 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
   1494 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   1495 ; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   1496 ; SKX-NEXT:    retq # sched: [7:1.00]
   1497   %vec = load <16 x i32>, <16 x i32>* %vp ; permute input is read through %vp
   1498   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2> ; second operand is undef, so every lane comes from %vec
   1499   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where %mask is zero
   1500   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 ; true lanes take %shuf, false lanes keep %vec2
   1501   ret <16 x i32> %res
   1502 }
   1503 
   1504 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) { ; Zero-masked permute of a loaded <16 x i32>: checks expect vptestnmd into %k1, then vpermd from (%rdi) under {%k1} {z}.
   1505 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask2:
   1506 ; GENERIC:       # %bb.0:
   1507 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [7:0.50]
   1508 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
   1509 ; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   1510 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1511 ;
   1512 ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask2:
   1513 ; SKX:       # %bb.0:
   1514 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
   1515 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
   1516 ; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   1517 ; SKX-NEXT:    retq # sched: [7:1.00]
   1518   %vec = load <16 x i32>, <16 x i32>* %vp ; permute input is read through %vp
   1519   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2> ; second operand is undef, so every lane comes from %vec
   1520   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where %mask is zero
   1521   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; true lanes take %shuf, false lanes are zeroed
   1522   ret <16 x i32> %res
   1523 }
   1524 
   1525 define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) { ; Unmasked permute of a loaded <16 x i32>: checks expect an index-vector load followed by vpermps from (%rdi).
   1526 ; GENERIC-LABEL: test_16xi32_perm_mem_mask3:
   1527 ; GENERIC:       # %bb.0:
   1528 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50]
   1529 ; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
   1530 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1531 ;
   1532 ; SKX-LABEL: test_16xi32_perm_mem_mask3:
   1533 ; SKX:       # %bb.0:
   1534 ; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
   1535 ; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
   1536 ; SKX-NEXT:    retq # sched: [7:1.00]
   1537   %vec = load <16 x i32>, <16 x i32>* %vp ; permute input is read through %vp
   1538   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1> ; second operand is undef, so every lane comes from %vec
   1539   ret <16 x i32> %res
   1540 }
   1541 define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; Merge-masked permute of a loaded <16 x i32>: checks expect vptestnmd into %k1, then vpermd from (%rdi) under {%k1}.
   1542 ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask3:
   1543 ; GENERIC:       # %bb.0:
   1544 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50]
   1545 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   1546 ; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   1547 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1548 ;
   1549 ; SKX-LABEL: test_masked_16xi32_perm_mem_mask3:
   1550 ; SKX:       # %bb.0:
   1551 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
   1552 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   1553 ; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   1554 ; SKX-NEXT:    retq # sched: [7:1.00]
   1555   %vec = load <16 x i32>, <16 x i32>* %vp ; permute input is read through %vp
   1556   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1> ; second operand is undef, so every lane comes from %vec
   1557   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where %mask is zero
   1558   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 ; true lanes take %shuf, false lanes keep %vec2
   1559   ret <16 x i32> %res
   1560 }
   1561 
   1562 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) { ; Zero-masked permute of a loaded <16 x i32>: checks expect vptestnmd into %k1, then vpermd from (%rdi) under {%k1} {z}.
   1563 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask3:
   1564 ; GENERIC:       # %bb.0:
   1565 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50]
   1566 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
   1567 ; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   1568 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1569 ;
   1570 ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask3:
   1571 ; SKX:       # %bb.0:
   1572 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
   1573 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
   1574 ; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   1575 ; SKX-NEXT:    retq # sched: [7:1.00]
   1576   %vec = load <16 x i32>, <16 x i32>* %vp ; permute input is read through %vp
   1577   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1> ; second operand is undef, so every lane comes from %vec
   1578   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where %mask is zero
   1579   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; true lanes take %shuf, false lanes are zeroed
   1580   ret <16 x i32> %res
   1581 }
   1582 
   1583 define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) { ; Unmasked permute of a <4 x i64> register argument: checks expect a single vpermpd with no index-vector load.
   1584 ; GENERIC-LABEL: test_4xi64_perm_mask0:
   1585 ; GENERIC:       # %bb.0:
   1586 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [1:1.00]
   1587 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1588 ;
   1589 ; SKX-LABEL: test_4xi64_perm_mask0:
   1590 ; SKX:       # %bb.0:
   1591 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [3:1.00]
   1592 ; SKX-NEXT:    retq # sched: [7:1.00]
   1593   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1> ; second operand is undef, so every lane comes from %vec
   1594   ret <4 x i64> %res
   1595 }
   1596 define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; Merge-masked permute of a <4 x i64> register argument: checks expect vptestnmq into %k1, vpermq into %ymm1 under {%k1}, then a copy to %ymm0.
   1597 ; GENERIC-LABEL: test_masked_4xi64_perm_mask0:
   1598 ; GENERIC:       # %bb.0:
   1599 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   1600 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [1:1.00]
   1601 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   1602 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1603 ;
   1604 ; SKX-LABEL: test_masked_4xi64_perm_mask0:
   1605 ; SKX:       # %bb.0:
   1606 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   1607 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00]
   1608 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   1609 ; SKX-NEXT:    retq # sched: [7:1.00]
   1610   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1> ; second operand is undef, so every lane comes from %vec
   1611   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1612   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ; true lanes take %shuf, false lanes keep %vec2
   1613   ret <4 x i64> %res
   1614 }
   1615 
   1616 define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) { ; Zero-masked permute of a <4 x i64> register argument: checks expect vptestnmq into %k1, then vpermq under {%k1} {z}.
   1617 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask0:
   1618 ; GENERIC:       # %bb.0:
   1619 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   1620 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [1:1.00]
   1621 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1622 ;
   1623 ; SKX-LABEL: test_masked_z_4xi64_perm_mask0:
   1624 ; SKX:       # %bb.0:
   1625 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   1626 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [3:1.00]
   1627 ; SKX-NEXT:    retq # sched: [7:1.00]
   1628   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1> ; second operand is undef, so every lane comes from %vec
   1629   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1630   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; true lanes take %shuf, false lanes are zeroed
   1631   ret <4 x i64> %res
   1632 }
   1633 define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; Merge-masked permute of a <4 x i64> register argument: checks expect vptestnmq into %k1, vpermq into %ymm1 under {%k1}, then a copy to %ymm0.
   1634 ; GENERIC-LABEL: test_masked_4xi64_perm_mask1:
   1635 ; GENERIC:       # %bb.0:
   1636 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   1637 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [1:1.00]
   1638 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   1639 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1640 ;
   1641 ; SKX-LABEL: test_masked_4xi64_perm_mask1:
   1642 ; SKX:       # %bb.0:
   1643 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   1644 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00]
   1645 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   1646 ; SKX-NEXT:    retq # sched: [7:1.00]
   1647   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3> ; second operand is undef, so every lane comes from %vec
   1648   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1649   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ; true lanes take %shuf, false lanes keep %vec2
   1650   ret <4 x i64> %res
   1651 }
   1652 
   1653 define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) { ; Zero-masked permute of a <4 x i64> register argument: checks expect vptestnmq into %k1, then vpermq under {%k1} {z}.
   1654 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask1:
   1655 ; GENERIC:       # %bb.0:
   1656 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   1657 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [1:1.00]
   1658 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1659 ;
   1660 ; SKX-LABEL: test_masked_z_4xi64_perm_mask1:
   1661 ; SKX:       # %bb.0:
   1662 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   1663 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [3:1.00]
   1664 ; SKX-NEXT:    retq # sched: [7:1.00]
   1665   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3> ; second operand is undef, so every lane comes from %vec
   1666   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1667   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; true lanes take %shuf, false lanes are zeroed
   1668   ret <4 x i64> %res
   1669 }
   1670 define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; Merge-masked permute of a <4 x i64> register argument: checks expect vptestnmq into %k1, vpermq into %ymm1 under {%k1}, then a copy to %ymm0.
   1671 ; GENERIC-LABEL: test_masked_4xi64_perm_mask2:
   1672 ; GENERIC:       # %bb.0:
   1673 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   1674 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [1:1.00]
   1675 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   1676 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1677 ;
   1678 ; SKX-LABEL: test_masked_4xi64_perm_mask2:
   1679 ; SKX:       # %bb.0:
   1680 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   1681 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00]
   1682 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   1683 ; SKX-NEXT:    retq # sched: [7:1.00]
   1684   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1> ; second operand is undef, so every lane comes from %vec
   1685   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1686   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ; true lanes take %shuf, false lanes keep %vec2
   1687   ret <4 x i64> %res
   1688 }
   1689 
   1690 define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) { ; Zero-masked permute of a <4 x i64> register argument: checks expect vptestnmq into %k1, then vpermq under {%k1} {z}.
   1691 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask2:
   1692 ; GENERIC:       # %bb.0:
   1693 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   1694 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [1:1.00]
   1695 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1696 ;
   1697 ; SKX-LABEL: test_masked_z_4xi64_perm_mask2:
   1698 ; SKX:       # %bb.0:
   1699 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   1700 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [3:1.00]
   1701 ; SKX-NEXT:    retq # sched: [7:1.00]
   1702   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1> ; second operand is undef, so every lane comes from %vec
   1703   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1704   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; true lanes take %shuf, false lanes are zeroed
   1705   ret <4 x i64> %res
   1706 }
   1707 define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) { ; Unmasked permute of a <4 x i64> register argument: checks expect a single vpermpd with no index-vector load.
   1708 ; GENERIC-LABEL: test_4xi64_perm_mask3:
   1709 ; GENERIC:       # %bb.0:
   1710 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [1:1.00]
   1711 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1712 ;
   1713 ; SKX-LABEL: test_4xi64_perm_mask3:
   1714 ; SKX:       # %bb.0:
   1715 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [3:1.00]
   1716 ; SKX-NEXT:    retq # sched: [7:1.00]
   1717   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3> ; second operand is undef, so every lane comes from %vec
   1718   ret <4 x i64> %res
   1719 }
   1720 define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; Merge-masked permute of a <4 x i64> register argument: checks expect vptestnmq into %k1, vpermq into %ymm1 under {%k1}, then a copy to %ymm0.
   1721 ; GENERIC-LABEL: test_masked_4xi64_perm_mask3:
   1722 ; GENERIC:       # %bb.0:
   1723 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   1724 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [1:1.00]
   1725 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   1726 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1727 ;
   1728 ; SKX-LABEL: test_masked_4xi64_perm_mask3:
   1729 ; SKX:       # %bb.0:
   1730 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   1731 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00]
   1732 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   1733 ; SKX-NEXT:    retq # sched: [7:1.00]
   1734   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3> ; second operand is undef, so every lane comes from %vec
   1735   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1736   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ; true lanes take %shuf, false lanes keep %vec2
   1737   ret <4 x i64> %res
   1738 }
   1739 
   1740 define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) { ; Zero-masked permute of a <4 x i64> register argument: checks expect vptestnmq into %k1, then vpermq under {%k1} {z}.
   1741 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask3:
   1742 ; GENERIC:       # %bb.0:
   1743 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   1744 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [1:1.00]
   1745 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1746 ;
   1747 ; SKX-LABEL: test_masked_z_4xi64_perm_mask3:
   1748 ; SKX:       # %bb.0:
   1749 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   1750 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [3:1.00]
   1751 ; SKX-NEXT:    retq # sched: [7:1.00]
   1752   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3> ; second operand is undef, so every lane comes from %vec
   1753   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1754   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; true lanes take %shuf, false lanes are zeroed
   1755   ret <4 x i64> %res
   1756 }
   1757 define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) { ; Unmasked permute of a loaded <4 x i64>: checks expect a single vpermpd reading from memory.
   1758 ; GENERIC-LABEL: test_4xi64_perm_mem_mask0:
   1759 ; GENERIC:       # %bb.0:
   1760 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [8:1.00]
   1761 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1762 ;
   1763 ; SKX-LABEL: test_4xi64_perm_mem_mask0:
   1764 ; SKX:       # %bb.0:
   1765 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [10:1.00]
   1766 ; SKX-NEXT:    retq # sched: [7:1.00]
   1767   %vec = load <4 x i64>, <4 x i64>* %vp ; permute input is read through %vp
   1768   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0> ; second operand is undef, so every lane comes from %vec
   1769   ret <4 x i64> %res
   1770 }
   1771 define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; Merge-masked permute of a loaded <4 x i64>: checks expect vptestnmq into %k1, then vpermq from memory under {%k1}.
   1772 ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask0:
   1773 ; GENERIC:       # %bb.0:
   1774 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   1775 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [8:1.00]
   1776 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1777 ;
   1778 ; SKX-LABEL: test_masked_4xi64_perm_mem_mask0:
   1779 ; SKX:       # %bb.0:
   1780 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   1781 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [10:1.00]
   1782 ; SKX-NEXT:    retq # sched: [7:1.00]
   1783   %vec = load <4 x i64>, <4 x i64>* %vp ; permute input is read through %vp
   1784   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0> ; second operand is undef, so every lane comes from %vec
   1785   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1786   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ; true lanes take %shuf, false lanes keep %vec2
   1787   ret <4 x i64> %res
   1788 }
   1789 
   1790 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) { ; Zero-masked permute of a loaded <4 x i64>: checks expect vptestnmq into %k1, then vpermq from memory under {%k1} {z}.
   1791 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask0:
   1792 ; GENERIC:       # %bb.0:
   1793 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
   1794 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [8:1.00]
   1795 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1796 ;
   1797 ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask0:
   1798 ; SKX:       # %bb.0:
   1799 ; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
   1800 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [10:1.00]
   1801 ; SKX-NEXT:    retq # sched: [7:1.00]
   1802   %vec = load <4 x i64>, <4 x i64>* %vp ; permute input is read through %vp
   1803   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0> ; second operand is undef, so every lane comes from %vec
   1804   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1805   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; true lanes take %shuf, false lanes are zeroed
   1806   ret <4 x i64> %res
   1807 }
   1808 
   1809 define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; Merge-masked permute of a loaded <4 x i64>: checks expect vptestnmq into %k1, then vpermq from memory under {%k1}.
   1810 ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask1:
   1811 ; GENERIC:       # %bb.0:
   1812 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   1813 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [8:1.00]
   1814 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1815 ;
   1816 ; SKX-LABEL: test_masked_4xi64_perm_mem_mask1:
   1817 ; SKX:       # %bb.0:
   1818 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   1819 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [10:1.00]
   1820 ; SKX-NEXT:    retq # sched: [7:1.00]
   1821   %vec = load <4 x i64>, <4 x i64>* %vp ; permute input is read through %vp
   1822   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1> ; second operand is undef, so every lane comes from %vec
   1823   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1824   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ; true lanes take %shuf, false lanes keep %vec2
   1825   ret <4 x i64> %res
   1826 }
   1827 
   1828 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) { ; Zero-masked permute of a loaded <4 x i64>: checks expect vptestnmq into %k1, then vpermq from memory under {%k1} {z}.
   1829 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask1:
   1830 ; GENERIC:       # %bb.0:
   1831 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
   1832 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [8:1.00]
   1833 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1834 ;
   1835 ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask1:
   1836 ; SKX:       # %bb.0:
   1837 ; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
   1838 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [10:1.00]
   1839 ; SKX-NEXT:    retq # sched: [7:1.00]
   1840   %vec = load <4 x i64>, <4 x i64>* %vp ; permute input is read through %vp
   1841   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1> ; second operand is undef, so every lane comes from %vec
   1842   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1843   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; true lanes take %shuf, false lanes are zeroed
   1844   ret <4 x i64> %res
   1845 }
   1846 
   1847 define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; Merge-masked permute of a loaded <4 x i64>: checks expect vptestnmq into %k1, then vpermq from memory under {%k1}.
   1848 ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask2:
   1849 ; GENERIC:       # %bb.0:
   1850 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   1851 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [8:1.00]
   1852 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1853 ;
   1854 ; SKX-LABEL: test_masked_4xi64_perm_mem_mask2:
   1855 ; SKX:       # %bb.0:
   1856 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   1857 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [10:1.00]
   1858 ; SKX-NEXT:    retq # sched: [7:1.00]
   1859   %vec = load <4 x i64>, <4 x i64>* %vp ; permute input is read through %vp
   1860   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0> ; second operand is undef, so every lane comes from %vec
   1861   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1862   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ; true lanes take %shuf, false lanes keep %vec2
   1863   ret <4 x i64> %res
   1864 }
   1865 
   1866 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) { ; Zero-masked permute of a loaded <4 x i64>: checks expect vptestnmq into %k1, then vpermq from memory under {%k1} {z}.
   1867 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask2:
   1868 ; GENERIC:       # %bb.0:
   1869 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
   1870 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [8:1.00]
   1871 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1872 ;
   1873 ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask2:
   1874 ; SKX:       # %bb.0:
   1875 ; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
   1876 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [10:1.00]
   1877 ; SKX-NEXT:    retq # sched: [7:1.00]
   1878   %vec = load <4 x i64>, <4 x i64>* %vp ; permute input is read through %vp
   1879   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0> ; second operand is undef, so every lane comes from %vec
   1880   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1881   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; true lanes take %shuf, false lanes are zeroed
   1882   ret <4 x i64> %res
   1883 }
   1884 
   1885 define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) { ; Unmasked permute of a loaded <4 x i64>: checks expect a single vpermpd reading from memory.
   1886 ; GENERIC-LABEL: test_4xi64_perm_mem_mask3:
   1887 ; GENERIC:       # %bb.0:
   1888 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [8:1.00]
   1889 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1890 ;
   1891 ; SKX-LABEL: test_4xi64_perm_mem_mask3:
   1892 ; SKX:       # %bb.0:
   1893 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [10:1.00]
   1894 ; SKX-NEXT:    retq # sched: [7:1.00]
   1895   %vec = load <4 x i64>, <4 x i64>* %vp ; permute input is read through %vp
   1896   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3> ; second operand is undef, so every lane comes from %vec
   1897   ret <4 x i64> %res
   1898 }
   1899 define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; Merge-masked permute of a loaded <4 x i64>: checks expect vptestnmq into %k1, then vpermq from memory under {%k1}.
   1900 ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask3:
   1901 ; GENERIC:       # %bb.0:
   1902 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   1903 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [8:1.00]
   1904 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1905 ;
   1906 ; SKX-LABEL: test_masked_4xi64_perm_mem_mask3:
   1907 ; SKX:       # %bb.0:
   1908 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   1909 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [10:1.00]
   1910 ; SKX-NEXT:    retq # sched: [7:1.00]
   1911   %vec = load <4 x i64>, <4 x i64>* %vp ; permute input is read through %vp
   1912   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3> ; second operand is undef, so every lane comes from %vec
   1913   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1914   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ; true lanes take %shuf, false lanes keep %vec2
   1915   ret <4 x i64> %res
   1916 }
   1917 
   1918 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) { ; Zero-masked permute of a loaded <4 x i64>: checks expect vptestnmq into %k1, then vpermq from memory under {%k1} {z}.
   1919 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask3:
   1920 ; GENERIC:       # %bb.0:
   1921 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
   1922 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [8:1.00]
   1923 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1924 ;
   1925 ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask3:
   1926 ; SKX:       # %bb.0:
   1927 ; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
   1928 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [10:1.00]
   1929 ; SKX-NEXT:    retq # sched: [7:1.00]
   1930   %vec = load <4 x i64>, <4 x i64>* %vp ; permute input is read through %vp
   1931   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3> ; second operand is undef, so every lane comes from %vec
   1932   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1933   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; true lanes take %shuf, false lanes are zeroed
   1934   ret <4 x i64> %res
   1935 }
   1936 
   1937 define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) { ; Unmasked permute of an <8 x i64> register argument: checks expect an index-vector load followed by vpermpd.
   1938 ; GENERIC-LABEL: test_8xi64_perm_mask0:
   1939 ; GENERIC:       # %bb.0:
   1940 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [7:0.50]
   1941 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
   1942 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1943 ;
   1944 ; SKX-LABEL: test_8xi64_perm_mask0:
   1945 ; SKX:       # %bb.0:
   1946 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
   1947 ; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
   1948 ; SKX-NEXT:    retq # sched: [7:1.00]
   1949   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6> ; second operand is undef, so every lane comes from %vec
   1950   ret <8 x i64> %res
   1951 }
   1952 define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; Merge-masked permute of an <8 x i64> register argument: checks expect an index-vector load, vptestnmq into %k1, vpermq into %zmm1 under {%k1}, then a copy to %zmm0.
   1953 ; GENERIC-LABEL: test_masked_8xi64_perm_mask0:
   1954 ; GENERIC:       # %bb.0:
   1955 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [7:0.50]
   1956 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   1957 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   1958 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   1959 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1960 ;
   1961 ; SKX-LABEL: test_masked_8xi64_perm_mask0:
   1962 ; SKX:       # %bb.0:
   1963 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
   1964 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   1965 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   1966 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   1967 ; SKX-NEXT:    retq # sched: [7:1.00]
   1968   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6> ; second operand is undef, so every lane comes from %vec
   1969   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1970   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2 ; true lanes take %shuf, false lanes keep %vec2
   1971   ret <8 x i64> %res
   1972 }
   1973 
   1974 define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) { ; Zero-masked permute of an <8 x i64> register argument: checks expect an index-vector load, vptestnmq into %k1, then vpermq under {%k1} {z}.
   1975 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask0:
   1976 ; GENERIC:       # %bb.0:
   1977 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [7:0.50]
   1978 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   1979 ; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   1980 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1981 ;
   1982 ; SKX-LABEL: test_masked_z_8xi64_perm_mask0:
   1983 ; SKX:       # %bb.0:
   1984 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
   1985 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   1986 ; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   1987 ; SKX-NEXT:    retq # sched: [7:1.00]
   1988   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6> ; second operand is undef, so every lane comes from %vec
   1989   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   1990   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ; true lanes take %shuf, false lanes are zeroed
   1991   ret <8 x i64> %res
   1992 }
   1993 define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; Merge-masked permute of an <8 x i64> register argument: checks expect vptestnmq into %k1, vpermq into %zmm1 under {%k1} with no index-vector load, then a copy to %zmm0.
   1994 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask1:
   1995 ; GENERIC:       # %bb.0:
   1996 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   1997 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
   1998 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   1999 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2000 ;
   2001 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask1:
   2002 ; SKX:       # %bb.0:
   2003 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   2004 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
   2005 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   2006 ; SKX-NEXT:    retq # sched: [7:1.00]
   2007   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5> ; indices 4..7 repeat the pattern of indices 0..3 offset by 4; every lane comes from %vec
   2008   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   2009   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2 ; true lanes take %shuf, false lanes keep %vec2
   2010   ret <8 x i64> %res
   2011 }
   2012 
   2013 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) { ; Zero-masked permute of an <8 x i64> register argument: checks expect vptestnmq into %k1, then vpermq under {%k1} {z} with no index-vector load.
   2014 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask1:
   2015 ; GENERIC:       # %bb.0:
   2016 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2017 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
   2018 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2019 ;
   2020 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask1:
   2021 ; SKX:       # %bb.0:
   2022 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2023 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
   2024 ; SKX-NEXT:    retq # sched: [7:1.00]
   2025   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5> ; indices 4..7 repeat the pattern of indices 0..3 offset by 4; every lane comes from %vec
   2026   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; true in lanes where %mask is zero
   2027   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ; true lanes take %shuf, false lanes are zeroed
   2028   ret <8 x i64> %res
   2029 }
   2030 define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Merge-masked case: lanes where %mask is zero take the shuffled %vec, the others keep %vec2.
        ; The checks expect the index vector in a zmm register feeding a variable-index vpermq under {%k1}.
   2031 ; GENERIC-LABEL: test_masked_8xi64_perm_mask2:
   2032 ; GENERIC:       # %bb.0:
   2033 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [7:0.50]
   2034 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   2035 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   2036 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   2037 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2038 ;
   2039 ; SKX-LABEL: test_masked_8xi64_perm_mask2:
   2040 ; SKX:       # %bb.0:
   2041 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
   2042 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   2043 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   2044 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   2045 ; SKX-NEXT:    retq # sched: [7:1.00]
   2046   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
   2047   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2048   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   2049   ret <8 x i64> %res
   2050 }
   2051 
   2052 define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) {
        ; Zero-masked case: lanes where %mask is zero take the shuffled %vec, the others become zero.
        ; The checks expect the index vector in a zmm register feeding a variable-index vpermq with {%k1} {z}.
   2053 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask2:
   2054 ; GENERIC:       # %bb.0:
   2055 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [7:0.50]
   2056 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2057 ; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   2058 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2059 ;
   2060 ; SKX-LABEL: test_masked_z_8xi64_perm_mask2:
   2061 ; SKX:       # %bb.0:
   2062 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
   2063 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2064 ; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   2065 ; SKX-NEXT:    retq # sched: [7:1.00]
   2066   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
   2067   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2068   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   2069   ret <8 x i64> %res
   2070 }
   2071 define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
        ; Unmasked case: plain shuffle of %vec whose index pattern repeats in both 4-element halves.
        ; The checks expect a single immediate-form vpermpd.
   2072 ; GENERIC-LABEL: test_8xi64_perm_imm_mask3:
   2073 ; GENERIC:       # %bb.0:
   2074 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
   2075 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2076 ;
   2077 ; SKX-LABEL: test_8xi64_perm_imm_mask3:
   2078 ; SKX:       # %bb.0:
   2079 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
   2080 ; SKX-NEXT:    retq # sched: [7:1.00]
   2081   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
   2082   ret <8 x i64> %res
   2083 }
   2084 define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Merge-masked case: lanes where %mask is zero take the shuffled %vec, the others keep %vec2.
        ; The index pattern repeats in both 4-element halves; the checks expect immediate-form vpermq under {%k1}.
   2085 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask3:
   2086 ; GENERIC:       # %bb.0:
   2087 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   2088 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
   2089 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   2090 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2091 ;
   2092 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask3:
   2093 ; SKX:       # %bb.0:
   2094 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   2095 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
   2096 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   2097 ; SKX-NEXT:    retq # sched: [7:1.00]
   2098   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
   2099   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2100   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   2101   ret <8 x i64> %res
   2102 }
   2103 
   2104 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
        ; Zero-masked case: lanes where %mask is zero take the shuffled %vec, the others become zero.
        ; The index pattern repeats in both 4-element halves; the checks expect immediate-form vpermq with {%k1} {z}.
   2105 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask3:
   2106 ; GENERIC:       # %bb.0:
   2107 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2108 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
   2109 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2110 ;
   2111 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask3:
   2112 ; SKX:       # %bb.0:
   2113 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2114 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
   2115 ; SKX-NEXT:    retq # sched: [7:1.00]
   2116   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
   2117   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2118   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   2119   ret <8 x i64> %res
   2120 }
   2121 define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Merge-masked case: lanes where %mask is zero take the shuffled %vec, the others keep %vec2.
        ; The checks expect the index vector in a zmm register feeding a variable-index vpermq under {%k1}.
   2122 ; GENERIC-LABEL: test_masked_8xi64_perm_mask4:
   2123 ; GENERIC:       # %bb.0:
   2124 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [7:0.50]
   2125 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   2126 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   2127 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   2128 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2129 ;
   2130 ; SKX-LABEL: test_masked_8xi64_perm_mask4:
   2131 ; SKX:       # %bb.0:
   2132 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
   2133 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   2134 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   2135 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   2136 ; SKX-NEXT:    retq # sched: [7:1.00]
   2137   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
   2138   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2139   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   2140   ret <8 x i64> %res
   2141 }
   2142 
   2143 define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) {
        ; Zero-masked case: lanes where %mask is zero take the shuffled %vec, the others become zero.
        ; The checks expect the index vector in a zmm register feeding a variable-index vpermq with {%k1} {z}.
   2144 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask4:
   2145 ; GENERIC:       # %bb.0:
   2146 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [7:0.50]
   2147 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2148 ; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   2149 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2150 ;
   2151 ; SKX-LABEL: test_masked_z_8xi64_perm_mask4:
   2152 ; SKX:       # %bb.0:
   2153 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
   2154 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2155 ; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   2156 ; SKX-NEXT:    retq # sched: [7:1.00]
   2157   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
   2158   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2159   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   2160   ret <8 x i64> %res
   2161 }
   2162 define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Merge-masked case: lanes where %mask is zero take the shuffled %vec, the others keep %vec2.
        ; The shuffle splats element 0 of each 4-element half; the checks expect immediate-form vpermq under {%k1}.
   2163 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask5:
   2164 ; GENERIC:       # %bb.0:
   2165 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   2166 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
   2167 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   2168 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2169 ;
   2170 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask5:
   2171 ; SKX:       # %bb.0:
   2172 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   2173 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
   2174 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   2175 ; SKX-NEXT:    retq # sched: [7:1.00]
   2176   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   2177   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2178   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   2179   ret <8 x i64> %res
   2180 }
   2181 
   2182 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
        ; Zero-masked case: lanes where %mask is zero take the shuffled %vec, the others become zero.
        ; The shuffle splats element 0 of each 4-element half; the checks expect immediate-form vpermq with {%k1} {z}.
   2183 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask5:
   2184 ; GENERIC:       # %bb.0:
   2185 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2186 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
   2187 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2188 ;
   2189 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask5:
   2190 ; SKX:       # %bb.0:
   2191 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2192 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
   2193 ; SKX-NEXT:    retq # sched: [7:1.00]
   2194   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   2195   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2196   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   2197   ret <8 x i64> %res
   2198 }
   2199 define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
        ; Unmasked case: plain shuffle of %vec with a constant index vector.
        ; The checks expect the indices loaded into zmm1 with vmovaps, then a variable-index vpermpd.
   2200 ; GENERIC-LABEL: test_8xi64_perm_mask6:
   2201 ; GENERIC:       # %bb.0:
   2202 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [7:0.50]
   2203 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
   2204 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2205 ;
   2206 ; SKX-LABEL: test_8xi64_perm_mask6:
   2207 ; SKX:       # %bb.0:
   2208 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
   2209 ; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
   2210 ; SKX-NEXT:    retq # sched: [7:1.00]
   2211   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
   2212   ret <8 x i64> %res
   2213 }
   2214 define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Merge-masked case: lanes where %mask is zero take the shuffled %vec, the others keep %vec2.
        ; The checks expect the index vector in a zmm register feeding a variable-index vpermq under {%k1}.
   2215 ; GENERIC-LABEL: test_masked_8xi64_perm_mask6:
   2216 ; GENERIC:       # %bb.0:
   2217 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [7:0.50]
   2218 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   2219 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   2220 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   2221 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2222 ;
   2223 ; SKX-LABEL: test_masked_8xi64_perm_mask6:
   2224 ; SKX:       # %bb.0:
   2225 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
   2226 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   2227 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   2228 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   2229 ; SKX-NEXT:    retq # sched: [7:1.00]
   2230   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
   2231   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2232   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   2233   ret <8 x i64> %res
   2234 }
   2235 
   2236 define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) {
        ; Zero-masked case: lanes where %mask is zero take the shuffled %vec, the others become zero.
        ; The checks expect the index vector in a zmm register feeding a variable-index vpermq with {%k1} {z}.
   2237 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask6:
   2238 ; GENERIC:       # %bb.0:
   2239 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [7:0.50]
   2240 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2241 ; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   2242 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2243 ;
   2244 ; SKX-LABEL: test_masked_z_8xi64_perm_mask6:
   2245 ; SKX:       # %bb.0:
   2246 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
   2247 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2248 ; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   2249 ; SKX-NEXT:    retq # sched: [7:1.00]
   2250   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
   2251   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2252   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   2253   ret <8 x i64> %res
   2254 }
   2255 define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Merge-masked case: lanes where %mask is zero take the shuffled %vec, the others keep %vec2.
        ; The shuffle splats element 3 of each 4-element half; the checks expect immediate-form vpermq under {%k1}.
   2256 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask7:
   2257 ; GENERIC:       # %bb.0:
   2258 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   2259 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
   2260 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   2261 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2262 ;
   2263 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask7:
   2264 ; SKX:       # %bb.0:
   2265 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   2266 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
   2267 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   2268 ; SKX-NEXT:    retq # sched: [7:1.00]
   2269   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
   2270   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2271   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   2272   ret <8 x i64> %res
   2273 }
   2274 
   2275 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
        ; Zero-masked case: lanes where %mask is zero take the shuffled %vec, the others become zero.
        ; The shuffle splats element 3 of each 4-element half; the checks expect immediate-form vpermq with {%k1} {z}.
   2276 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask7:
   2277 ; GENERIC:       # %bb.0:
   2278 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2279 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
   2280 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2281 ;
   2282 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask7:
   2283 ; SKX:       # %bb.0:
   2284 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2285 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
   2286 ; SKX-NEXT:    retq # sched: [7:1.00]
   2287   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
   2288   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2289   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   2290   ret <8 x i64> %res
   2291 }
   2292 define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
        ; Unmasked, memory-source case: the vector is loaded from %vp and shuffled with constant indices.
        ; The checks expect the indices in zmm0 and the load used as vpermpd's (%rdi) memory operand.
   2293 ; GENERIC-LABEL: test_8xi64_perm_mem_mask0:
   2294 ; GENERIC:       # %bb.0:
   2295 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [7:0.50]
   2296 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
   2297 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2298 ;
   2299 ; SKX-LABEL: test_8xi64_perm_mem_mask0:
   2300 ; SKX:       # %bb.0:
   2301 ; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
   2302 ; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
   2303 ; SKX-NEXT:    retq # sched: [7:1.00]
   2304   %vec = load <8 x i64>, <8 x i64>* %vp
   2305   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
   2306   ret <8 x i64> %res
   2307 }
   2308 define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Merge-masked, memory-source case: lanes where %mask is zero take the shuffled load from %vp, the others keep %vec2.
        ; The checks expect a variable-index vpermq with a (%rdi) memory operand under {%k1}.
   2309 ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask0:
   2310 ; GENERIC:       # %bb.0:
   2311 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [7:0.50]
   2312 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2313 ; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   2314 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2315 ;
   2316 ; SKX-LABEL: test_masked_8xi64_perm_mem_mask0:
   2317 ; SKX:       # %bb.0:
   2318 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
   2319 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2320 ; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   2321 ; SKX-NEXT:    retq # sched: [7:1.00]
   2322   %vec = load <8 x i64>, <8 x i64>* %vp
   2323   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
   2324   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2325   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   2326   ret <8 x i64> %res
   2327 }
   2328 
   2329 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %mask) {
        ; Zero-masked, memory-source case: lanes where %mask is zero take the shuffled load from %vp, the others become zero.
        ; The checks expect a variable-index vpermq with a (%rdi) memory operand and {%k1} {z}.
   2330 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask0:
   2331 ; GENERIC:       # %bb.0:
   2332 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [7:0.50]
   2333 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   2334 ; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   2335 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2336 ;
   2337 ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask0:
   2338 ; SKX:       # %bb.0:
   2339 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
   2340 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   2341 ; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   2342 ; SKX-NEXT:    retq # sched: [7:1.00]
   2343   %vec = load <8 x i64>, <8 x i64>* %vp
   2344   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
   2345   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2346   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   2347   ret <8 x i64> %res
   2348 }
   2349 
   2350 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Merge-masked, memory-source case: lanes where %mask is zero take the shuffled load from %vp, the others keep %vec2.
        ; The index pattern repeats in both 4-element halves; the checks expect immediate-form vpermq on mem under {%k1}.
   2351 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
   2352 ; GENERIC:       # %bb.0:
   2353 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2354 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [8:1.00]
   2355 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2356 ;
   2357 ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
   2358 ; SKX:       # %bb.0:
   2359 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2360 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
   2361 ; SKX-NEXT:    retq # sched: [7:1.00]
   2362   %vec = load <8 x i64>, <8 x i64>* %vp
   2363   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
   2364   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2365   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   2366   ret <8 x i64> %res
   2367 }
   2368 
   2369 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) {
        ; Zero-masked, memory-source case: lanes where %mask is zero take the shuffled load from %vp, the others become zero.
        ; The index pattern repeats in both 4-element halves; the checks expect immediate-form vpermq on mem with {%k1} {z}.
   2370 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
   2371 ; GENERIC:       # %bb.0:
   2372 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   2373 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [8:1.00]
   2374 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2375 ;
   2376 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
   2377 ; SKX:       # %bb.0:
   2378 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   2379 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
   2380 ; SKX-NEXT:    retq # sched: [7:1.00]
   2381   %vec = load <8 x i64>, <8 x i64>* %vp
   2382   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
   2383   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2384   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   2385   ret <8 x i64> %res
   2386 }
   2387 
   2388 define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Merge-masked, memory-source case: lanes where %mask is zero take the shuffled load from %vp, the others keep %vec2.
        ; The checks expect a variable-index vpermq with a (%rdi) memory operand under {%k1}.
   2389 ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask2:
   2390 ; GENERIC:       # %bb.0:
   2391 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [7:0.50]
   2392 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2393 ; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   2394 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2395 ;
   2396 ; SKX-LABEL: test_masked_8xi64_perm_mem_mask2:
   2397 ; SKX:       # %bb.0:
   2398 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
   2399 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2400 ; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   2401 ; SKX-NEXT:    retq # sched: [7:1.00]
   2402   %vec = load <8 x i64>, <8 x i64>* %vp
   2403   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
   2404   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2405   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   2406   ret <8 x i64> %res
   2407 }
   2408 
   2409 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %mask) {
        ; Zero-masked, memory-source case: lanes where %mask is zero take the shuffled load from %vp, the others become zero.
        ; The checks expect a variable-index vpermq with a (%rdi) memory operand and {%k1} {z}.
   2410 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask2:
   2411 ; GENERIC:       # %bb.0:
   2412 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [7:0.50]
   2413 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   2414 ; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   2415 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2416 ;
   2417 ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask2:
   2418 ; SKX:       # %bb.0:
   2419 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
   2420 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   2421 ; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   2422 ; SKX-NEXT:    retq # sched: [7:1.00]
   2423   %vec = load <8 x i64>, <8 x i64>* %vp
   2424   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
   2425   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2426   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   2427   ret <8 x i64> %res
   2428 }
   2429 
   2430 define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
        ; Unmasked, memory-source case: the vector loaded from %vp is shuffled with a pattern that repeats in both halves.
        ; The checks expect a single immediate-form vpermpd reading from mem.
   2431 ; GENERIC-LABEL: test_8xi64_perm_imm_mem_mask3:
   2432 ; GENERIC:       # %bb.0:
   2433 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
   2434 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2435 ;
   2436 ; SKX-LABEL: test_8xi64_perm_imm_mem_mask3:
   2437 ; SKX:       # %bb.0:
   2438 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
   2439 ; SKX-NEXT:    retq # sched: [7:1.00]
   2440   %vec = load <8 x i64>, <8 x i64>* %vp
   2441   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
   2442   ret <8 x i64> %res
   2443 }
   2444 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Merge-masked, memory-source case: lanes where %mask is zero take the shuffled load from %vp, the others keep %vec2.
        ; The index pattern repeats in both 4-element halves; the checks expect immediate-form vpermq on mem under {%k1}.
   2445 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
   2446 ; GENERIC:       # %bb.0:
   2447 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2448 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
   2449 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2450 ;
   2451 ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
   2452 ; SKX:       # %bb.0:
   2453 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2454 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
   2455 ; SKX-NEXT:    retq # sched: [7:1.00]
   2456   %vec = load <8 x i64>, <8 x i64>* %vp
   2457   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
   2458   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2459   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   2460   ret <8 x i64> %res
   2461 }
   2462 
   2463 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) {
        ; Zero-masked, memory-source case: lanes where %mask is zero take the shuffled load from %vp, the others become zero.
        ; The index pattern repeats in both 4-element halves; the checks expect immediate-form vpermq on mem with {%k1} {z}.
   2464 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
   2465 ; GENERIC:       # %bb.0:
   2466 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   2467 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
   2468 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2469 ;
   2470 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
   2471 ; SKX:       # %bb.0:
   2472 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   2473 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
   2474 ; SKX-NEXT:    retq # sched: [7:1.00]
   2475   %vec = load <8 x i64>, <8 x i64>* %vp
   2476   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
   2477   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2478   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   2479   ret <8 x i64> %res
   2480 }
   2481 
   2482 define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Merge-masked, memory-source case: lanes where %mask is zero take the shuffled load from %vp, the others keep %vec2.
        ; The checks expect a variable-index vpermq with a (%rdi) memory operand under {%k1}.
   2483 ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask4:
   2484 ; GENERIC:       # %bb.0:
   2485 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [7:0.50]
   2486 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2487 ; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   2488 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2489 ;
   2490 ; SKX-LABEL: test_masked_8xi64_perm_mem_mask4:
   2491 ; SKX:       # %bb.0:
   2492 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
   2493 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2494 ; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   2495 ; SKX-NEXT:    retq # sched: [7:1.00]
   2496   %vec = load <8 x i64>, <8 x i64>* %vp
   2497   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
   2498   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2499   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   2500   ret <8 x i64> %res
   2501 }
   2502 
   2503 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %mask) {
        ; Zero-masked, memory-source case: lanes where %mask is zero take the shuffled load from %vp, the others become zero.
        ; The checks expect a variable-index vpermq with a (%rdi) memory operand and {%k1} {z}.
   2504 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask4:
   2505 ; GENERIC:       # %bb.0:
   2506 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [7:0.50]
   2507 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   2508 ; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   2509 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2510 ;
   2511 ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask4:
   2512 ; SKX:       # %bb.0:
   2513 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
   2514 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   2515 ; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   2516 ; SKX-NEXT:    retq # sched: [7:1.00]
   2517   %vec = load <8 x i64>, <8 x i64>* %vp
   2518   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
   2519   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2520   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   2521   ret <8 x i64> %res
   2522 }
   2523 
   2524 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Merge-masked, memory-source case: lanes where %mask is zero take the shuffled load from %vp, the others keep %vec2.
        ; The index pattern repeats in both 4-element halves; the checks expect immediate-form vpermq on mem under {%k1}.
   2525 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
   2526 ; GENERIC:       # %bb.0:
   2527 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2528 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [8:1.00]
   2529 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2530 ;
   2531 ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
   2532 ; SKX:       # %bb.0:
   2533 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2534 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
   2535 ; SKX-NEXT:    retq # sched: [7:1.00]
   2536   %vec = load <8 x i64>, <8 x i64>* %vp
   2537   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
   2538   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2539   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   2540   ret <8 x i64> %res
   2541 }
   2542 
   2543 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) {
        ; Zero-masked, memory-source case: lanes where %mask is zero take the shuffled load from %vp, the others become zero.
        ; The index pattern repeats in both 4-element halves; the checks expect immediate-form vpermq on mem with {%k1} {z}.
   2544 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
   2545 ; GENERIC:       # %bb.0:
   2546 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   2547 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [8:1.00]
   2548 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2549 ;
   2550 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
   2551 ; SKX:       # %bb.0:
   2552 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   2553 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
   2554 ; SKX-NEXT:    retq # sched: [7:1.00]
   2555   %vec = load <8 x i64>, <8 x i64>* %vp
   2556   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
   2557   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2558   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   2559   ret <8 x i64> %res
   2560 }
   2561 
   2562 define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
        ; Unmasked, memory-source case: the vector is loaded from %vp and shuffled with constant indices.
        ; The checks expect the indices in zmm0 and the load used as vpermpd's (%rdi) memory operand.
   2563 ; GENERIC-LABEL: test_8xi64_perm_mem_mask6:
   2564 ; GENERIC:       # %bb.0:
   2565 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [7:0.50]
   2566 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
   2567 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2568 ;
   2569 ; SKX-LABEL: test_8xi64_perm_mem_mask6:
   2570 ; SKX:       # %bb.0:
   2571 ; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
   2572 ; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
   2573 ; SKX-NEXT:    retq # sched: [7:1.00]
   2574   %vec = load <8 x i64>, <8 x i64>* %vp
   2575   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
   2576   ret <8 x i64> %res
   2577 }
   2578 define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
        ; Merge-masked, memory-source case: lanes where %mask is zero take the shuffled load from %vp, the others keep %vec2.
        ; The checks expect a variable-index vpermq with a (%rdi) memory operand under {%k1}.
   2579 ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask6:
   2580 ; GENERIC:       # %bb.0:
   2581 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [7:0.50]
   2582 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2583 ; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   2584 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2585 ;
   2586 ; SKX-LABEL: test_masked_8xi64_perm_mem_mask6:
   2587 ; SKX:       # %bb.0:
   2588 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
   2589 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2590 ; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   2591 ; SKX-NEXT:    retq # sched: [7:1.00]
   2592   %vec = load <8 x i64>, <8 x i64>* %vp
   2593   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
   2594   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   2595   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   2596   ret <8 x i64> %res
   2597 }
   2598 
   2599 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %mask) { ; Zero-masked i64 permute of a vector loaded from %vp.
   2600 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask6:
   2601 ; GENERIC:       # %bb.0:
   2602 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [7:0.50]
   2603 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   2604 ; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   2605 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2606 ;
   2607 ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask6:
   2608 ; SKX:       # %bb.0:
   2609 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
   2610 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   2611 ; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   2612 ; SKX-NEXT:    retq # sched: [7:1.00]
   2613   %vec = load <8 x i64>, <8 x i64>* %vp ; the checks expect this load folded as the (%rdi) operand of vpermq
   2614   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6> ; same index list as the constant vector in the checks
   2615   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2616   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ; permuted lane where true, zero otherwise
   2617   ret <8 x i64> %res
   2618 }
   2619 
   2620 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; Merge-masked i64 permute of a vector loaded from %vp; %vec2 supplies the pass-through lanes.
   2621 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
   2622 ; GENERIC:       # %bb.0:
   2623 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   2624 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [8:1.00]
   2625 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2626 ;
   2627 ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
   2628 ; SKX:       # %bb.0:
   2629 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   2630 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
   2631 ; SKX-NEXT:    retq # sched: [7:1.00]
   2632   %vec = load <8 x i64>, <8 x i64>* %vp ; the checks expect this load folded into vpermq as a mem operand
   2633   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5> ; indices stay inside each 4-element half and repeat the same pattern (second half = first + 4)
   2634   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2635   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2 ; permuted lane where true, %vec2 lane otherwise
   2636   ret <8 x i64> %res
   2637 }
   2638 
   2639 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) { ; Zero-masked i64 permute of a vector loaded from %vp.
   2640 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
   2641 ; GENERIC:       # %bb.0:
   2642 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   2643 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [8:1.00]
   2644 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2645 ;
   2646 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
   2647 ; SKX:       # %bb.0:
   2648 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   2649 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
   2650 ; SKX-NEXT:    retq # sched: [7:1.00]
   2651   %vec = load <8 x i64>, <8 x i64>* %vp ; the checks expect this load folded into vpermq as a mem operand
   2652   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5> ; indices stay inside each 4-element half and repeat the same pattern (second half = first + 4)
   2653   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2654   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ; permuted lane where true, zero otherwise
   2655   ret <8 x i64> %res
   2656 }
   2657 
   2658 define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) { ; Unmasked single-source float permute of the register argument %vec.
   2659 ; GENERIC-LABEL: test_8xfloat_perm_mask0:
   2660 ; GENERIC:       # %bb.0:
   2661 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
   2662 ; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
   2663 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2664 ;
   2665 ; SKX-LABEL: test_8xfloat_perm_mask0:
   2666 ; SKX:       # %bb.0:
   2667 ; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
   2668 ; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
   2669 ; SKX-NEXT:    retq # sched: [7:1.00]
   2670   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4> ; same index list as the constant vector the checks expect as the vpermps index operand
   2671   ret <8 x float> %res
   2672 }
   2673 define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) { ; Merge-masked float permute of %vec; %vec2 supplies the pass-through lanes.
   2674 ; GENERIC-LABEL: test_masked_8xfloat_perm_mask0:
   2675 ; GENERIC:       # %bb.0:
   2676 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
   2677 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   2678 ; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
   2679 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   2680 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2681 ;
   2682 ; SKX-LABEL: test_masked_8xfloat_perm_mask0:
   2683 ; SKX:       # %bb.0:
   2684 ; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
   2685 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   2686 ; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
   2687 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   2688 ; SKX-NEXT:    retq # sched: [7:1.00]
   2689   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4> ; same index list as the constant vector in the checks
   2690   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2691   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ; permuted lane where true, %vec2 lane otherwise
   2692   ret <8 x float> %res
   2693 }
   2694 
   2695 define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x i32> %mask) { ; Zero-masked float permute of the register argument %vec.
   2696 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask0:
   2697 ; GENERIC:       # %bb.0:
   2698 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
   2699 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   2700 ; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
   2701 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2702 ;
   2703 ; SKX-LABEL: test_masked_z_8xfloat_perm_mask0:
   2704 ; SKX:       # %bb.0:
   2705 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
   2706 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   2707 ; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
   2708 ; SKX-NEXT:    retq # sched: [7:1.00]
   2709   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4> ; same index list as the constant vector in the checks
   2710   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2711   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ; permuted lane where true, zero otherwise
   2712   ret <8 x float> %res
   2713 }
   2714 define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) { ; Merge-masked float permute of %vec; %vec2 supplies the pass-through lanes.
   2715 ; GENERIC-LABEL: test_masked_8xfloat_perm_mask1:
   2716 ; GENERIC:       # %bb.0:
   2717 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
   2718 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   2719 ; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
   2720 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   2721 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2722 ;
   2723 ; SKX-LABEL: test_masked_8xfloat_perm_mask1:
   2724 ; SKX:       # %bb.0:
   2725 ; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
   2726 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   2727 ; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
   2728 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   2729 ; SKX-NEXT:    retq # sched: [7:1.00]
   2730   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1> ; same index list as the constant vector in the checks
   2731   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2732   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ; permuted lane where true, %vec2 lane otherwise
   2733   ret <8 x float> %res
   2734 }
   2735 
   2736 define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x i32> %mask) { ; Zero-masked float permute of the register argument %vec; %mask uses i32 lanes like the sibling 8xfloat tests.
   2737 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask1:
   2738 ; GENERIC:       # %bb.0:
   2739 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
   2740 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   2741 ; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
   2742 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2743 ;
   2744 ; SKX-LABEL: test_masked_z_8xfloat_perm_mask1:
   2745 ; SKX:       # %bb.0:
   2746 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
   2747 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   2748 ; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
   2749 ; SKX-NEXT:    retq # sched: [7:1.00]
   2750   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1> ; same index list as the constant vector in the checks
   2751   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2752   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ; permuted lane where true, zero otherwise
   2753   ret <8 x float> %res
   2754 }
   2755 define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) { ; Merge-masked float permute of %vec; %vec2 supplies the pass-through lanes.
   2756 ; GENERIC-LABEL: test_masked_8xfloat_perm_mask2:
   2757 ; GENERIC:       # %bb.0:
   2758 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
   2759 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   2760 ; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
   2761 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   2762 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2763 ;
   2764 ; SKX-LABEL: test_masked_8xfloat_perm_mask2:
   2765 ; SKX:       # %bb.0:
   2766 ; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
   2767 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   2768 ; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
   2769 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   2770 ; SKX-NEXT:    retq # sched: [7:1.00]
   2771   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5> ; same index list as the constant vector in the checks
   2772   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2773   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ; permuted lane where true, %vec2 lane otherwise
   2774   ret <8 x float> %res
   2775 }
   2776 
   2777 define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x i32> %mask) { ; Zero-masked float permute of the register argument %vec.
   2778 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask2:
   2779 ; GENERIC:       # %bb.0:
   2780 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
   2781 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   2782 ; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
   2783 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2784 ;
   2785 ; SKX-LABEL: test_masked_z_8xfloat_perm_mask2:
   2786 ; SKX:       # %bb.0:
   2787 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
   2788 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   2789 ; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
   2790 ; SKX-NEXT:    retq # sched: [7:1.00]
   2791   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5> ; same index list as the constant vector in the checks
   2792   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2793   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ; permuted lane where true, zero otherwise
   2794   ret <8 x float> %res
   2795 }
   2796 define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) { ; Unmasked single-source float permute of the register argument %vec.
   2797 ; GENERIC-LABEL: test_8xfloat_perm_mask3:
   2798 ; GENERIC:       # %bb.0:
   2799 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
   2800 ; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
   2801 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2802 ;
   2803 ; SKX-LABEL: test_8xfloat_perm_mask3:
   2804 ; SKX:       # %bb.0:
   2805 ; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
   2806 ; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
   2807 ; SKX-NEXT:    retq # sched: [7:1.00]
   2808   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6> ; same index list as the constant vector the checks expect as the vpermps index operand
   2809   ret <8 x float> %res
   2810 }
   2811 define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) { ; Merge-masked float permute of %vec; %vec2 supplies the pass-through lanes.
   2812 ; GENERIC-LABEL: test_masked_8xfloat_perm_mask3:
   2813 ; GENERIC:       # %bb.0:
   2814 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
   2815 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   2816 ; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
   2817 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   2818 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2819 ;
   2820 ; SKX-LABEL: test_masked_8xfloat_perm_mask3:
   2821 ; SKX:       # %bb.0:
   2822 ; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
   2823 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   2824 ; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
   2825 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   2826 ; SKX-NEXT:    retq # sched: [7:1.00]
   2827   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6> ; same index list as the constant vector in the checks
   2828   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2829   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ; permuted lane where true, %vec2 lane otherwise
   2830   ret <8 x float> %res
   2831 }
   2832 
   2833 define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x i32> %mask) { ; Zero-masked float permute of the register argument %vec.
   2834 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask3:
   2835 ; GENERIC:       # %bb.0:
   2836 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
   2837 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   2838 ; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
   2839 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2840 ;
   2841 ; SKX-LABEL: test_masked_z_8xfloat_perm_mask3:
   2842 ; SKX:       # %bb.0:
   2843 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
   2844 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   2845 ; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
   2846 ; SKX-NEXT:    retq # sched: [7:1.00]
   2847   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6> ; same index list as the constant vector in the checks
   2848   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2849   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ; permuted lane where true, zero otherwise
   2850   ret <8 x float> %res
   2851 }
   2852 define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) { ; Unmasked float permute of a vector loaded from %vp.
   2853 ; GENERIC-LABEL: test_8xfloat_perm_mem_mask0:
   2854 ; GENERIC:       # %bb.0:
   2855 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
   2856 ; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   2857 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2858 ;
   2859 ; SKX-LABEL: test_8xfloat_perm_mem_mask0:
   2860 ; SKX:       # %bb.0:
   2861 ; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
   2862 ; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   2863 ; SKX-NEXT:    retq # sched: [7:1.00]
   2864   %vec = load <8 x float>, <8 x float>* %vp ; the checks expect this load folded as the (%rdi) operand of vpermps
   2865   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0> ; same index list as the constant vector in the checks
   2866   ret <8 x float> %res
   2867 }
   2868 define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) { ; Merge-masked float permute of a vector loaded from %vp; %vec2 supplies the pass-through lanes.
   2869 ; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask0:
   2870 ; GENERIC:       # %bb.0:
   2871 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
   2872 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   2873 ; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
   2874 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2875 ;
   2876 ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask0:
   2877 ; SKX:       # %bb.0:
   2878 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
   2879 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   2880 ; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
   2881 ; SKX-NEXT:    retq # sched: [7:1.00]
   2882   %vec = load <8 x float>, <8 x float>* %vp ; the checks expect this load folded as the (%rdi) operand of vpermps
   2883   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0> ; same index list as the constant vector in the checks
   2884   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2885   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ; permuted lane where true, %vec2 lane otherwise
   2886   ret <8 x float> %res
   2887 }
   2888 
   2889 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x i32> %mask) { ; Zero-masked float permute of a vector loaded from %vp.
   2890 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
   2891 ; GENERIC:       # %bb.0:
   2892 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
   2893 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
   2894 ; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
   2895 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2896 ;
   2897 ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
   2898 ; SKX:       # %bb.0:
   2899 ; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
   2900 ; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
   2901 ; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
   2902 ; SKX-NEXT:    retq # sched: [7:1.00]
   2903   %vec = load <8 x float>, <8 x float>* %vp ; the checks expect this load folded as the (%rdi) operand of vpermps
   2904   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0> ; same index list as the constant vector in the checks
   2905   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2906   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ; permuted lane where true, zero otherwise
   2907   ret <8 x float> %res
   2908 }
   2909 
   2910 define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) { ; Merge-masked float permute of a vector loaded from %vp; %vec2 supplies the pass-through lanes.
   2911 ; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask1:
   2912 ; GENERIC:       # %bb.0:
   2913 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
   2914 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   2915 ; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
   2916 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2917 ;
   2918 ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask1:
   2919 ; SKX:       # %bb.0:
   2920 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
   2921 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   2922 ; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
   2923 ; SKX-NEXT:    retq # sched: [7:1.00]
   2924   %vec = load <8 x float>, <8 x float>* %vp ; the checks expect this load folded as the (%rdi) operand of vpermps
   2925   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6> ; same index list as the constant vector in the checks
   2926   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2927   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ; permuted lane where true, %vec2 lane otherwise
   2928   ret <8 x float> %res
   2929 }
   2930 
   2931 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x i32> %mask) { ; Zero-masked float permute of a vector loaded from %vp.
   2932 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
   2933 ; GENERIC:       # %bb.0:
   2934 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
   2935 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
   2936 ; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
   2937 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2938 ;
   2939 ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
   2940 ; SKX:       # %bb.0:
   2941 ; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
   2942 ; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
   2943 ; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
   2944 ; SKX-NEXT:    retq # sched: [7:1.00]
   2945   %vec = load <8 x float>, <8 x float>* %vp ; the checks expect this load folded as the (%rdi) operand of vpermps
   2946   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6> ; same index list as the constant vector in the checks
   2947   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2948   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ; permuted lane where true, zero otherwise
   2949   ret <8 x float> %res
   2950 }
   2951 
   2952 define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) { ; Merge-masked float permute of a vector loaded from %vp; %vec2 supplies the pass-through lanes.
   2953 ; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask2:
   2954 ; GENERIC:       # %bb.0:
   2955 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
   2956 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   2957 ; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
   2958 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2959 ;
   2960 ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask2:
   2961 ; SKX:       # %bb.0:
   2962 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
   2963 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   2964 ; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
   2965 ; SKX-NEXT:    retq # sched: [7:1.00]
   2966   %vec = load <8 x float>, <8 x float>* %vp ; the checks expect this load folded as the (%rdi) operand of vpermps
   2967   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4> ; same index list as the constant vector in the checks
   2968   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2969   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ; permuted lane where true, %vec2 lane otherwise
   2970   ret <8 x float> %res
   2971 }
   2972 
   2973 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x i32> %mask) { ; Zero-masked float permute of a vector loaded from %vp.
   2974 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
   2975 ; GENERIC:       # %bb.0:
   2976 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
   2977 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
   2978 ; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
   2979 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2980 ;
   2981 ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
   2982 ; SKX:       # %bb.0:
   2983 ; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
   2984 ; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
   2985 ; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
   2986 ; SKX-NEXT:    retq # sched: [7:1.00]
   2987   %vec = load <8 x float>, <8 x float>* %vp ; the checks expect this load folded as the (%rdi) operand of vpermps
   2988   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4> ; same index list as the constant vector in the checks
   2989   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   2990   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ; permuted lane where true, zero otherwise
   2991   ret <8 x float> %res
   2992 }
   2993 
   2994 define <8 x float> @test_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x i32> %mask) { ; Unmasked float permute of a vector loaded from %vp. NOTE(review): %mask is never referenced in the body, unlike the sibling unmasked tests that take only %vp; confirm the parameter is intentional.
   2995 ; GENERIC-LABEL: test_8xfloat_perm_mem_mask3:
   2996 ; GENERIC:       # %bb.0:
   2997 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
   2998 ; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   2999 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3000 ;
   3001 ; SKX-LABEL: test_8xfloat_perm_mem_mask3:
   3002 ; SKX:       # %bb.0:
   3003 ; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
   3004 ; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   3005 ; SKX-NEXT:    retq # sched: [7:1.00]
   3006   %vec = load <8 x float>, <8 x float>* %vp ; the checks expect this load folded as the (%rdi) operand of vpermps
   3007   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0> ; same index list as the constant vector in the checks
   3008   ret <8 x float> %res
   3009 }
   3010 define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) { ; Merge-masked float permute of a vector loaded from %vp; %vec2 supplies the pass-through lanes.
   3011 ; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask3:
   3012 ; GENERIC:       # %bb.0:
   3013 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
   3014 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   3015 ; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
   3016 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3017 ;
   3018 ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask3:
   3019 ; SKX:       # %bb.0:
   3020 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
   3021 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   3022 ; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
   3023 ; SKX-NEXT:    retq # sched: [7:1.00]
   3024   %vec = load <8 x float>, <8 x float>* %vp ; the checks expect this load folded as the (%rdi) operand of vpermps
   3025   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0> ; same index list as the constant vector in the checks
   3026   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   3027   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ; permuted lane where true, %vec2 lane otherwise
   3028   ret <8 x float> %res
   3029 }
   3030 
   3031 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x i32> %mask) { ; Zero-masked float permute of a vector loaded from %vp.
   3032 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
   3033 ; GENERIC:       # %bb.0:
   3034 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
   3035 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
   3036 ; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
   3037 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3038 ;
   3039 ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
   3040 ; SKX:       # %bb.0:
   3041 ; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
   3042 ; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
   3043 ; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
   3044 ; SKX-NEXT:    retq # sched: [7:1.00]
   3045   %vec = load <8 x float>, <8 x float>* %vp ; the checks expect this load folded as the (%rdi) operand of vpermps
   3046   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0> ; same index list as the constant vector in the checks
   3047   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   3048   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ; permuted lane where true, zero otherwise
   3049   ret <8 x float> %res
   3050 }
   3051 
   3052 define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) { ; Unmasked single-source 16-lane float permute of the register argument %vec.
   3053 ; GENERIC-LABEL: test_16xfloat_perm_mask0:
   3054 ; GENERIC:       # %bb.0:
   3055 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50]
   3056 ; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
   3057 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3058 ;
   3059 ; SKX-LABEL: test_16xfloat_perm_mask0:
   3060 ; SKX:       # %bb.0:
   3061 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
   3062 ; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
   3063 ; SKX-NEXT:    retq # sched: [7:1.00]
   3064   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7> ; same index list as the constant vector the checks expect as the vpermps index operand
   3065   ret <16 x float> %res
   3066 }
   3067 define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) { ; Merge-masked 16-lane float permute of %vec; %vec2 supplies the pass-through lanes.
   3068 ; GENERIC-LABEL: test_masked_16xfloat_perm_mask0:
   3069 ; GENERIC:       # %bb.0:
   3070 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50]
   3071 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   3072 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   3073 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   3074 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3075 ;
   3076 ; SKX-LABEL: test_masked_16xfloat_perm_mask0:
   3077 ; SKX:       # %bb.0:
   3078 ; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
   3079 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   3080 ; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   3081 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   3082 ; SKX-NEXT:    retq # sched: [7:1.00]
   3083   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7> ; same index list as the constant vector in the checks
   3084   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   3085   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2 ; permuted lane where true, %vec2 lane otherwise
   3086   ret <16 x float> %res
   3087 }
   3088 
   3089 define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x i32> %mask) { ; Zero-masked 16-lane float permute of the register argument %vec.
   3090 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask0:
   3091 ; GENERIC:       # %bb.0:
   3092 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50]
   3093 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   3094 ; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   3095 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3096 ;
   3097 ; SKX-LABEL: test_masked_z_16xfloat_perm_mask0:
   3098 ; SKX:       # %bb.0:
   3099 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
   3100 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   3101 ; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   3102 ; SKX-NEXT:    retq # sched: [7:1.00]
   3103   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7> ; same index list as the constant vector in the checks
   3104   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   3105   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ; permuted lane where true, zero otherwise
   3106   ret <16 x float> %res
   3107 }
   3108 define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) { ; Merge-masked 16-lane float permute of %vec; %vec2 supplies the pass-through lanes.
   3109 ; GENERIC-LABEL: test_masked_16xfloat_perm_mask1:
   3110 ; GENERIC:       # %bb.0:
   3111 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [7:0.50]
   3112 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   3113 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   3114 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   3115 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3116 ;
   3117 ; SKX-LABEL: test_masked_16xfloat_perm_mask1:
   3118 ; SKX:       # %bb.0:
   3119 ; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
   3120 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   3121 ; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   3122 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   3123 ; SKX-NEXT:    retq # sched: [7:1.00]
   3124   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1> ; same index list as the constant vector in the checks
   3125   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane is true where the %mask element is zero
   3126   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2 ; permuted lane where true, %vec2 lane otherwise
   3127   ret <16 x float> %res
   3128 }
   3129 
   ; Zero-masked <16 x float> permute of a register operand: lanes whose %mask
   ; element equals zero take the shuffled %vec element, all other lanes are zero.
   ; Checks expect a constant index load, vptestnmd into %k1, and one vpermps
   ; carrying {%k1} {z}.
   3130 define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x i32> %mask) {
   3131 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask1:
   3132 ; GENERIC:       # %bb.0:
   3133 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [7:0.50]
   3134 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   3135 ; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   3136 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3137 ;
   3138 ; SKX-LABEL: test_masked_z_16xfloat_perm_mask1:
   3139 ; SKX:       # %bb.0:
   3140 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
   3141 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   3142 ; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   3143 ; SKX-NEXT:    retq # sched: [7:1.00]
   3144   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
   3145   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   3146   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   3147   ret <16 x float> %res
   3148 }
   ; Merge-masked <16 x float> permute of a register operand: lanes whose %mask
   ; element equals zero take the shuffled %vec element, the rest keep %vec2.
   ; Checks expect a constant index load, vptestnmd into %k1, a {%k1}-masked
   ; vpermps writing zmm1, and a vmovaps of zmm1 into zmm0.
   3149 define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
   3150 ; GENERIC-LABEL: test_masked_16xfloat_perm_mask2:
   3151 ; GENERIC:       # %bb.0:
   3152 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [7:0.50]
   3153 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   3154 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   3155 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   3156 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3157 ;
   3158 ; SKX-LABEL: test_masked_16xfloat_perm_mask2:
   3159 ; SKX:       # %bb.0:
   3160 ; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
   3161 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   3162 ; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   3163 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   3164 ; SKX-NEXT:    retq # sched: [7:1.00]
   3165   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
   3166   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   3167   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   3168   ret <16 x float> %res
   3169 }
   3170 
   ; Zero-masked <16 x float> permute of a register operand: lanes whose %mask
   ; element equals zero take the shuffled %vec element, all other lanes are zero.
   ; Checks expect a constant index load, vptestnmd into %k1, and one vpermps
   ; carrying {%k1} {z}.
   3171 define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x i32> %mask) {
   3172 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask2:
   3173 ; GENERIC:       # %bb.0:
   3174 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [7:0.50]
   3175 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   3176 ; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   3177 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3178 ;
   3179 ; SKX-LABEL: test_masked_z_16xfloat_perm_mask2:
   3180 ; SKX:       # %bb.0:
   3181 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
   3182 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   3183 ; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   3184 ; SKX-NEXT:    retq # sched: [7:1.00]
   3185   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
   3186   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   3187   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   3188   ret <16 x float> %res
   3189 }
   ; Unmasked <16 x float> permute of a register operand by a constant index
   ; vector. Checks expect a constant index load followed by a single vpermps,
   ; with GENERIC and SKX sched annotations.
   3190 define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
   3191 ; GENERIC-LABEL: test_16xfloat_perm_mask3:
   3192 ; GENERIC:       # %bb.0:
   3193 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50]
   3194 ; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
   3195 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3196 ;
   3197 ; SKX-LABEL: test_16xfloat_perm_mask3:
   3198 ; SKX:       # %bb.0:
   3199 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
   3200 ; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
   3201 ; SKX-NEXT:    retq # sched: [7:1.00]
   3202   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
   3203   ret <16 x float> %res
   3204 }
   ; Merge-masked <16 x float> permute of a register operand: lanes whose %mask
   ; element equals zero take the shuffled %vec element, the rest keep %vec2.
   ; Checks expect a constant index load, vptestnmd into %k1, a {%k1}-masked
   ; vpermps writing zmm1, and a vmovaps of zmm1 into zmm0.
   3205 define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
   3206 ; GENERIC-LABEL: test_masked_16xfloat_perm_mask3:
   3207 ; GENERIC:       # %bb.0:
   3208 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50]
   3209 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   3210 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   3211 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   3212 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3213 ;
   3214 ; SKX-LABEL: test_masked_16xfloat_perm_mask3:
   3215 ; SKX:       # %bb.0:
   3216 ; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
   3217 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   3218 ; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   3219 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   3220 ; SKX-NEXT:    retq # sched: [7:1.00]
   3221   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
   3222   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   3223   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   3224   ret <16 x float> %res
   3225 }
   3226 
   ; Zero-masked <16 x float> permute of a register operand: lanes whose %mask
   ; element equals zero take the shuffled %vec element, all other lanes are zero.
   ; Checks expect a constant index load, vptestnmd into %k1, and one vpermps
   ; carrying {%k1} {z}.
   3227 define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x i32> %mask) {
   3228 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask3:
   3229 ; GENERIC:       # %bb.0:
   3230 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50]
   3231 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   3232 ; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   3233 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3234 ;
   3235 ; SKX-LABEL: test_masked_z_16xfloat_perm_mask3:
   3236 ; SKX:       # %bb.0:
   3237 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
   3238 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   3239 ; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   3240 ; SKX-NEXT:    retq # sched: [7:1.00]
   3241   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
   3242   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   3243   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   3244   ret <16 x float> %res
   3245 }
   ; Unmasked <16 x float> permute whose source vector is loaded from %vp.
   ; Checks expect a constant index load and a vpermps that takes (%rdi) as a
   ; memory operand, with no separate load instruction.
   3246 define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
   3247 ; GENERIC-LABEL: test_16xfloat_perm_mem_mask0:
   3248 ; GENERIC:       # %bb.0:
   3249 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50]
   3250 ; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
   3251 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3252 ;
   3253 ; SKX-LABEL: test_16xfloat_perm_mem_mask0:
   3254 ; SKX:       # %bb.0:
   3255 ; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
   3256 ; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
   3257 ; SKX-NEXT:    retq # sched: [7:1.00]
   3258   %vec = load <16 x float>, <16 x float>* %vp
   3259   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
   3260   ret <16 x float> %res
   3261 }
   ; Merge-masked <16 x float> permute whose source vector is loaded from %vp:
   ; lanes whose %mask element equals zero take the shuffled element, the rest
   ; keep %vec2. Checks expect a constant index load, vptestnmd into %k1, and a
   ; {%k1}-masked vpermps with a (%rdi) memory operand writing zmm0.
   3262 define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
   3263 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask0:
   3264 ; GENERIC:       # %bb.0:
   3265 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50]
   3266 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   3267 ; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   3268 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3269 ;
   3270 ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask0:
   3271 ; SKX:       # %bb.0:
   3272 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
   3273 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   3274 ; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   3275 ; SKX-NEXT:    retq # sched: [7:1.00]
   3276   %vec = load <16 x float>, <16 x float>* %vp
   3277   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
   3278   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   3279   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   3280   ret <16 x float> %res
   3281 }
   3282 
   ; Zero-masked <16 x float> permute whose source vector is loaded from %vp:
   ; lanes whose %mask element equals zero take the shuffled element, all other
   ; lanes are zero. Checks expect a constant index load, vptestnmd into %k1, and
   ; a vpermps with a (%rdi) memory operand carrying {%k1} {z}.
   3283 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x i32> %mask) {
   3284 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
   3285 ; GENERIC:       # %bb.0:
   3286 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50]
   3287 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
   3288 ; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   3289 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3290 ;
   3291 ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
   3292 ; SKX:       # %bb.0:
   3293 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
   3294 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
   3295 ; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   3296 ; SKX-NEXT:    retq # sched: [7:1.00]
   3297   %vec = load <16 x float>, <16 x float>* %vp
   3298   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
   3299   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   3300   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   3301   ret <16 x float> %res
   3302 }
   3303 
   ; Merge-masked <16 x float> permute whose source vector is loaded from %vp:
   ; lanes whose %mask element equals zero take the shuffled element, the rest
   ; keep %vec2. Checks expect a constant index load, vptestnmd into %k1, and a
   ; {%k1}-masked vpermps with a (%rdi) memory operand writing zmm0.
   3304 define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
   3305 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask1:
   3306 ; GENERIC:       # %bb.0:
   3307 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [7:0.50]
   3308 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   3309 ; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   3310 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3311 ;
   3312 ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask1:
   3313 ; SKX:       # %bb.0:
   3314 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
   3315 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   3316 ; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   3317 ; SKX-NEXT:    retq # sched: [7:1.00]
   3318   %vec = load <16 x float>, <16 x float>* %vp
   3319   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
   3320   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   3321   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   3322   ret <16 x float> %res
   3323 }
   3324 
   ; Zero-masked <16 x float> permute whose source vector is loaded from %vp:
   ; lanes whose %mask element equals zero take the shuffled element, all other
   ; lanes are zero. Checks expect a constant index load, vptestnmd into %k1, and
   ; a vpermps with a (%rdi) memory operand carrying {%k1} {z}.
   3325 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x i32> %mask) {
   3326 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
   3327 ; GENERIC:       # %bb.0:
   3328 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [7:0.50]
   3329 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
   3330 ; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   3331 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3332 ;
   3333 ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
   3334 ; SKX:       # %bb.0:
   3335 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
   3336 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
   3337 ; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   3338 ; SKX-NEXT:    retq # sched: [7:1.00]
   3339   %vec = load <16 x float>, <16 x float>* %vp
   3340   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
   3341   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   3342   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   3343   ret <16 x float> %res
   3344 }
   3345 
   ; Merge-masked <16 x float> permute whose source vector is loaded from %vp:
   ; lanes whose %mask element equals zero take the shuffled element, the rest
   ; keep %vec2. Checks expect a constant index load, vptestnmd into %k1, and a
   ; {%k1}-masked vpermps with a (%rdi) memory operand writing zmm0.
   3346 define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
   3347 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask2:
   3348 ; GENERIC:       # %bb.0:
   3349 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [7:0.50]
   3350 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   3351 ; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   3352 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3353 ;
   3354 ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask2:
   3355 ; SKX:       # %bb.0:
   3356 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
   3357 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   3358 ; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   3359 ; SKX-NEXT:    retq # sched: [7:1.00]
   3360   %vec = load <16 x float>, <16 x float>* %vp
   3361   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
   3362   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   3363   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   3364   ret <16 x float> %res
   3365 }
   3366 
   ; Zero-masked <16 x float> permute whose source vector is loaded from %vp:
   ; lanes whose %mask element equals zero take the shuffled element, all other
   ; lanes are zero. Checks expect a constant index load, vptestnmd into %k1, and
   ; a vpermps with a (%rdi) memory operand carrying {%k1} {z}.
   3367 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x i32> %mask) {
   3368 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
   3369 ; GENERIC:       # %bb.0:
   3370 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [7:0.50]
   3371 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
   3372 ; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   3373 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3374 ;
   3375 ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
   3376 ; SKX:       # %bb.0:
   3377 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
   3378 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
   3379 ; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   3380 ; SKX-NEXT:    retq # sched: [7:1.00]
   3381   %vec = load <16 x float>, <16 x float>* %vp
   3382   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
   3383   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   3384   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   3385   ret <16 x float> %res
   3386 }
   3387 
   ; Unmasked <16 x float> permute whose source vector is loaded from %vp.
   ; Checks expect a constant index load and a vpermps that takes (%rdi) as a
   ; memory operand, with no separate load instruction.
   3388 define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
   3389 ; GENERIC-LABEL: test_16xfloat_perm_mem_mask3:
   3390 ; GENERIC:       # %bb.0:
   3391 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50]
   3392 ; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
   3393 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3394 ;
   3395 ; SKX-LABEL: test_16xfloat_perm_mem_mask3:
   3396 ; SKX:       # %bb.0:
   3397 ; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
   3398 ; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
   3399 ; SKX-NEXT:    retq # sched: [7:1.00]
   3400   %vec = load <16 x float>, <16 x float>* %vp
   3401   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
   3402   ret <16 x float> %res
   3403 }
   ; Merge-masked <16 x float> permute whose source vector is loaded from %vp:
   ; lanes whose %mask element equals zero take the shuffled element, the rest
   ; keep %vec2. Checks expect a constant index load, vptestnmd into %k1, and a
   ; {%k1}-masked vpermps with a (%rdi) memory operand writing zmm0.
   3404 define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
   3405 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask3:
   3406 ; GENERIC:       # %bb.0:
   3407 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50]
   3408 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   3409 ; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   3410 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3411 ;
   3412 ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask3:
   3413 ; SKX:       # %bb.0:
   3414 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
   3415 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   3416 ; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   3417 ; SKX-NEXT:    retq # sched: [7:1.00]
   3418   %vec = load <16 x float>, <16 x float>* %vp
   3419   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
   3420   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   3421   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   3422   ret <16 x float> %res
   3423 }
   3424 
   ; Zero-masked <16 x float> permute whose source vector is loaded from %vp:
   ; lanes whose %mask element equals zero take the shuffled element, all other
   ; lanes are zero. Checks expect a constant index load, vptestnmd into %k1, and
   ; a vpermps with a (%rdi) memory operand carrying {%k1} {z}.
   3425 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x i32> %mask) {
   3426 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
   3427 ; GENERIC:       # %bb.0:
   3428 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50]
   3429 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
   3430 ; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   3431 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3432 ;
   3433 ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
   3434 ; SKX:       # %bb.0:
   3435 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
   3436 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
   3437 ; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   3438 ; SKX-NEXT:    retq # sched: [7:1.00]
   3439   %vec = load <16 x float>, <16 x float>* %vp
   3440   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
   3441   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   3442   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   3443   ret <16 x float> %res
   3444 }
   3445 
   ; Unmasked <4 x double> permute of a register operand with constant indices
   ; <2,1,3,2>. Checks expect a single vpermpd printed as ymm0[2,1,3,2], with no
   ; preceding index-vector load, plus GENERIC and SKX sched annotations.
   3446 define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) {
   3447 ; GENERIC-LABEL: test_4xdouble_perm_mask0:
   3448 ; GENERIC:       # %bb.0:
   3449 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [1:1.00]
   3450 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3451 ;
   3452 ; SKX-LABEL: test_4xdouble_perm_mask0:
   3453 ; SKX:       # %bb.0:
   3454 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [3:1.00]
   3455 ; SKX-NEXT:    retq # sched: [7:1.00]
   3456   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
   3457   ret <4 x double> %res
   3458 }
   ; Merge-masked <4 x double> permute of a register operand: lanes whose %mask
   ; element equals zero take the shuffled %vec element, the rest keep %vec2.
   ; Checks expect vptestnmq into %k1, a {%k1}-masked vpermpd writing ymm1, and a
   ; vmovapd of ymm1 into ymm0.
   3459 define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
   3460 ; GENERIC-LABEL: test_masked_4xdouble_perm_mask0:
   3461 ; GENERIC:       # %bb.0:
   3462 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   3463 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [1:1.00]
   3464 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   3465 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3466 ;
   3467 ; SKX-LABEL: test_masked_4xdouble_perm_mask0:
   3468 ; SKX:       # %bb.0:
   3469 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   3470 ; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00]
   3471 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   3472 ; SKX-NEXT:    retq # sched: [7:1.00]
   3473   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
   3474   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3475   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   3476   ret <4 x double> %res
   3477 }
   3478 
   ; Zero-masked <4 x double> permute of a register operand: lanes whose %mask
   ; element equals zero take the shuffled %vec element, all other lanes are zero.
   ; Checks expect vptestnmq into %k1 and one vpermpd carrying {%k1} {z}.
   3479 define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i64> %mask) {
   3480 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask0:
   3481 ; GENERIC:       # %bb.0:
   3482 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   3483 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [1:1.00]
   3484 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3485 ;
   3486 ; SKX-LABEL: test_masked_z_4xdouble_perm_mask0:
   3487 ; SKX:       # %bb.0:
   3488 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   3489 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [3:1.00]
   3490 ; SKX-NEXT:    retq # sched: [7:1.00]
   3491   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
   3492   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3493   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   3494   ret <4 x double> %res
   3495 }
   ; Merge-masked <4 x double> permute of a register operand: lanes whose %mask
   ; element equals zero take the shuffled %vec element, the rest keep %vec2.
   ; Checks expect vptestnmq into %k1, a {%k1}-masked vpermpd writing ymm1, and a
   ; vmovapd of ymm1 into ymm0.
   3496 define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
   3497 ; GENERIC-LABEL: test_masked_4xdouble_perm_mask1:
   3498 ; GENERIC:       # %bb.0:
   3499 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   3500 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [1:1.00]
   3501 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   3502 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3503 ;
   3504 ; SKX-LABEL: test_masked_4xdouble_perm_mask1:
   3505 ; SKX:       # %bb.0:
   3506 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   3507 ; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00]
   3508 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   3509 ; SKX-NEXT:    retq # sched: [7:1.00]
   3510   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
   3511   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3512   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   3513   ret <4 x double> %res
   3514 }
   3515 
   ; Zero-masked <4 x double> permute of a register operand: lanes whose %mask
   ; element equals zero take the shuffled %vec element, all other lanes are zero.
   ; Checks expect vptestnmq into %k1 and one vpermpd carrying {%k1} {z}.
   3516 define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i64> %mask) {
   3517 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask1:
   3518 ; GENERIC:       # %bb.0:
   3519 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   3520 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [1:1.00]
   3521 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3522 ;
   3523 ; SKX-LABEL: test_masked_z_4xdouble_perm_mask1:
   3524 ; SKX:       # %bb.0:
   3525 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   3526 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [3:1.00]
   3527 ; SKX-NEXT:    retq # sched: [7:1.00]
   3528   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
   3529   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3530   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   3531   ret <4 x double> %res
   3532 }
   ; Merge-masked <4 x double> permute of a register operand: lanes whose %mask
   ; element equals zero take the shuffled %vec element, the rest keep %vec2.
   ; Checks expect vptestnmq into %k1, a {%k1}-masked vpermpd writing ymm1, and a
   ; vmovapd of ymm1 into ymm0.
   3533 define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
   3534 ; GENERIC-LABEL: test_masked_4xdouble_perm_mask2:
   3535 ; GENERIC:       # %bb.0:
   3536 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   3537 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [1:1.00]
   3538 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   3539 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3540 ;
   3541 ; SKX-LABEL: test_masked_4xdouble_perm_mask2:
   3542 ; SKX:       # %bb.0:
   3543 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   3544 ; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00]
   3545 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   3546 ; SKX-NEXT:    retq # sched: [7:1.00]
   3547   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
   3548   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3549   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   3550   ret <4 x double> %res
   3551 }
   3552 
   ; Zero-masked <4 x double> permute of a register operand: lanes whose %mask
   ; element equals zero take the shuffled %vec element, all other lanes are zero.
   ; Checks expect vptestnmq into %k1 and one vpermpd carrying {%k1} {z}.
   3553 define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x i64> %mask) {
   3554 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask2:
   3555 ; GENERIC:       # %bb.0:
   3556 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   3557 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [1:1.00]
   3558 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3559 ;
   3560 ; SKX-LABEL: test_masked_z_4xdouble_perm_mask2:
   3561 ; SKX:       # %bb.0:
   3562 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   3563 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [3:1.00]
   3564 ; SKX-NEXT:    retq # sched: [7:1.00]
   3565   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
   3566   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3567   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   3568   ret <4 x double> %res
   3569 }
   ; Unmasked <4 x double> permute of a register operand with constant indices
   ; <3,3,3,2>. Checks expect a single vpermpd printed as ymm0[3,3,3,2], with no
   ; preceding index-vector load.
   3570 define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) {
   3571 ; GENERIC-LABEL: test_4xdouble_perm_mask3:
   3572 ; GENERIC:       # %bb.0:
   3573 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [1:1.00]
   3574 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3575 ;
   3576 ; SKX-LABEL: test_4xdouble_perm_mask3:
   3577 ; SKX:       # %bb.0:
   3578 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [3:1.00]
   3579 ; SKX-NEXT:    retq # sched: [7:1.00]
   3580   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
   3581   ret <4 x double> %res
   3582 }
   ; Merge-masked <4 x double> permute of a register operand: lanes whose %mask
   ; element equals zero take the shuffled %vec element, the rest keep %vec2.
   ; Checks expect vptestnmq into %k1, a {%k1}-masked vpermpd writing ymm1, and a
   ; vmovapd of ymm1 into ymm0.
   3583 define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
   3584 ; GENERIC-LABEL: test_masked_4xdouble_perm_mask3:
   3585 ; GENERIC:       # %bb.0:
   3586 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   3587 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [1:1.00]
   3588 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   3589 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3590 ;
   3591 ; SKX-LABEL: test_masked_4xdouble_perm_mask3:
   3592 ; SKX:       # %bb.0:
   3593 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   3594 ; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00]
   3595 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   3596 ; SKX-NEXT:    retq # sched: [7:1.00]
   3597   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
   3598   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3599   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   3600   ret <4 x double> %res
   3601 }
   3602 
   ; Zero-masked <4 x double> permute of a register operand: lanes whose %mask
   ; element equals zero take the shuffled %vec element, all other lanes are zero.
   ; Checks expect vptestnmq into %k1 and one vpermpd carrying {%k1} {z}.
   3603 define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x i64> %mask) {
   3604 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask3:
   3605 ; GENERIC:       # %bb.0:
   3606 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   3607 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [1:1.00]
   3608 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3609 ;
   3610 ; SKX-LABEL: test_masked_z_4xdouble_perm_mask3:
   3611 ; SKX:       # %bb.0:
   3612 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   3613 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [3:1.00]
   3614 ; SKX-NEXT:    retq # sched: [7:1.00]
   3615   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
   3616   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3617   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   3618   ret <4 x double> %res
   3619 }
   ; Unmasked <4 x double> permute whose source vector is loaded from %vp, with
   ; constant indices <0,0,2,0>. Checks expect a single vpermpd printed with a
   ; mem[0,0,2,0] source and no separate load instruction.
   3620 define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
   3621 ; GENERIC-LABEL: test_4xdouble_perm_mem_mask0:
   3622 ; GENERIC:       # %bb.0:
   3623 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [8:1.00]
   3624 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3625 ;
   3626 ; SKX-LABEL: test_4xdouble_perm_mem_mask0:
   3627 ; SKX:       # %bb.0:
   3628 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [10:1.00]
   3629 ; SKX-NEXT:    retq # sched: [7:1.00]
   3630   %vec = load <4 x double>, <4 x double>* %vp
   3631   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   3632   ret <4 x double> %res
   3633 }
   3634 define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
        ; Merge-masked permute, indices <0,0,2,0>, of a vector loaded from %vp:
        ; lanes whose %mask element equals zero take the shuffled value, the
        ; remaining lanes keep %vec2. The checks expect a memory-operand vpermpd
        ; writing ymm0 under %k1.
   3635 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask0:
   3636 ; GENERIC:       # %bb.0:
   3637 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   3638 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [8:1.00]
   3639 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3640 ;
   3641 ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask0:
   3642 ; SKX:       # %bb.0:
   3643 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   3644 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [10:1.00]
   3645 ; SKX-NEXT:    retq # sched: [7:1.00]
   3646   %vec = load <4 x double>, <4 x double>* %vp
   3647   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   3648   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3649   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   3650   ret <4 x double> %res
   3651 }
   3652 
   3653 define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x i64> %mask) {
        ; Zero-masked permute, indices <0,0,2,0>, of a vector loaded from %vp:
        ; lanes whose %mask element equals zero take the shuffled value, all other
        ; lanes become zero. The checks expect a memory-operand {%k1} {z} vpermpd.
   3654 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
   3655 ; GENERIC:       # %bb.0:
   3656 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
   3657 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [8:1.00]
   3658 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3659 ;
   3660 ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
   3661 ; SKX:       # %bb.0:
   3662 ; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
   3663 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [10:1.00]
   3664 ; SKX-NEXT:    retq # sched: [7:1.00]
   3665   %vec = load <4 x double>, <4 x double>* %vp
   3666   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   3667   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3668   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   3669   ret <4 x double> %res
   3670 }
   3671 
   3672 define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
        ; Merge-masked permute, indices <0,2,3,2>, of a vector loaded from %vp:
        ; lanes whose %mask element equals zero take the shuffled value, the
        ; remaining lanes keep %vec2. The checks expect a memory-operand vpermpd
        ; writing ymm0 under %k1.
   3673 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask1:
   3674 ; GENERIC:       # %bb.0:
   3675 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   3676 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [8:1.00]
   3677 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3678 ;
   3679 ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask1:
   3680 ; SKX:       # %bb.0:
   3681 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   3682 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [10:1.00]
   3683 ; SKX-NEXT:    retq # sched: [7:1.00]
   3684   %vec = load <4 x double>, <4 x double>* %vp
   3685   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
   3686   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3687   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   3688   ret <4 x double> %res
   3689 }
   3690 
   3691 define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x i64> %mask) {
        ; Zero-masked permute, indices <0,2,3,2>, of a vector loaded from %vp:
        ; lanes whose %mask element equals zero take the shuffled value, all other
        ; lanes become zero. The checks expect a memory-operand {%k1} {z} vpermpd.
   3692 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
   3693 ; GENERIC:       # %bb.0:
   3694 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
   3695 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [8:1.00]
   3696 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3697 ;
   3698 ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
   3699 ; SKX:       # %bb.0:
   3700 ; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
   3701 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [10:1.00]
   3702 ; SKX-NEXT:    retq # sched: [7:1.00]
   3703   %vec = load <4 x double>, <4 x double>* %vp
   3704   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
   3705   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3706   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   3707   ret <4 x double> %res
   3708 }
   3709 
   3710 define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
        ; Merge-masked permute, indices <3,1,1,1>, of a vector loaded from %vp:
        ; lanes whose %mask element equals zero take the shuffled value, the
        ; remaining lanes keep %vec2. The checks expect a memory-operand vpermpd
        ; writing ymm0 under %k1.
   3711 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask2:
   3712 ; GENERIC:       # %bb.0:
   3713 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   3714 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [8:1.00]
   3715 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3716 ;
   3717 ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask2:
   3718 ; SKX:       # %bb.0:
   3719 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   3720 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [10:1.00]
   3721 ; SKX-NEXT:    retq # sched: [7:1.00]
   3722   %vec = load <4 x double>, <4 x double>* %vp
   3723   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   3724   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3725   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   3726   ret <4 x double> %res
   3727 }
   3728 
   3729 define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x i64> %mask) {
        ; Zero-masked permute, indices <3,1,1,1>, of a vector loaded from %vp:
        ; lanes whose %mask element equals zero take the shuffled value, all other
        ; lanes become zero. The checks expect a memory-operand {%k1} {z} vpermpd.
   3730 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
   3731 ; GENERIC:       # %bb.0:
   3732 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
   3733 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [8:1.00]
   3734 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3735 ;
   3736 ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
   3737 ; SKX:       # %bb.0:
   3738 ; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
   3739 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [10:1.00]
   3740 ; SKX-NEXT:    retq # sched: [7:1.00]
   3741   %vec = load <4 x double>, <4 x double>* %vp
   3742   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   3743   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3744   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   3745   ret <4 x double> %res
   3746 }
   3747 
   3748 define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
        ; Unmasked permute, indices <3,2,3,2>, of a <4 x double> loaded from %vp.
        ; The checks expect the load to appear as the memory operand of vpermpd.
   3749 ; GENERIC-LABEL: test_4xdouble_perm_mem_mask3:
   3750 ; GENERIC:       # %bb.0:
   3751 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [8:1.00]
   3752 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3753 ;
   3754 ; SKX-LABEL: test_4xdouble_perm_mem_mask3:
   3755 ; SKX:       # %bb.0:
   3756 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [10:1.00]
   3757 ; SKX-NEXT:    retq # sched: [7:1.00]
   3758   %vec = load <4 x double>, <4 x double>* %vp
   3759   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
   3760   ret <4 x double> %res
   3761 }
   3762 define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
        ; Merge-masked permute, indices <3,2,3,2>, of a vector loaded from %vp:
        ; lanes whose %mask element equals zero take the shuffled value, the
        ; remaining lanes keep %vec2. The checks expect a memory-operand vpermpd
        ; writing ymm0 under %k1.
   3763 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask3:
   3764 ; GENERIC:       # %bb.0:
   3765 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   3766 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [8:1.00]
   3767 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3768 ;
   3769 ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask3:
   3770 ; SKX:       # %bb.0:
   3771 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   3772 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [10:1.00]
   3773 ; SKX-NEXT:    retq # sched: [7:1.00]
   3774   %vec = load <4 x double>, <4 x double>* %vp
   3775   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
   3776   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3777   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   3778   ret <4 x double> %res
   3779 }
   3780 
   3781 define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x i64> %mask) {
        ; Zero-masked permute, indices <3,2,3,2>, of a vector loaded from %vp:
        ; lanes whose %mask element equals zero take the shuffled value, all other
        ; lanes become zero. The checks expect a memory-operand {%k1} {z} vpermpd.
   3782 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
   3783 ; GENERIC:       # %bb.0:
   3784 ; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
   3785 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [8:1.00]
   3786 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3787 ;
   3788 ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
   3789 ; SKX:       # %bb.0:
   3790 ; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
   3791 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [10:1.00]
   3792 ; SKX-NEXT:    retq # sched: [7:1.00]
   3793   %vec = load <4 x double>, <4 x double>* %vp
   3794   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
   3795   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   3796   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   3797   ret <4 x double> %res
   3798 }
   3799 
   3800 define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
        ; Unmasked permute of %vec with indices <5,7,4,2,7,4,3,4>. The checks
        ; expect the indices to be loaded as a constant vector into zmm1 and
        ; passed to the register-index form of vpermpd.
   3801 ; GENERIC-LABEL: test_8xdouble_perm_mask0:
   3802 ; GENERIC:       # %bb.0:
   3803 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [7:0.50]
   3804 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
   3805 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3806 ;
   3807 ; SKX-LABEL: test_8xdouble_perm_mask0:
   3808 ; SKX:       # %bb.0:
   3809 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
   3810 ; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
   3811 ; SKX-NEXT:    retq # sched: [7:1.00]
   3812   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
   3813   ret <8 x double> %res
   3814 }
   3815 define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
        ; Merge-masked permute of %vec with indices <5,7,4,2,7,4,3,4>: lanes whose
        ; %mask element equals zero take the shuffled value, the remaining lanes
        ; keep %vec2. The checks expect a constant index vector in zmm3, a vpermpd
        ; into zmm1 under %k1, then a vmovapd copy of zmm1 into zmm0.
   3816 ; GENERIC-LABEL: test_masked_8xdouble_perm_mask0:
   3817 ; GENERIC:       # %bb.0:
   3818 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [7:0.50]
   3819 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   3820 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   3821 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   3822 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3823 ;
   3824 ; SKX-LABEL: test_masked_8xdouble_perm_mask0:
   3825 ; SKX:       # %bb.0:
   3826 ; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
   3827 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   3828 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   3829 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   3830 ; SKX-NEXT:    retq # sched: [7:1.00]
   3831   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
   3832   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   3833   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   3834   ret <8 x double> %res
   3835 }
   3836 
   3837 define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i64> %mask) {
        ; Zero-masked permute of %vec with indices <5,7,4,2,7,4,3,4>: lanes whose
        ; %mask element equals zero take the shuffled value, all other lanes become
        ; zero. The checks expect a constant index vector in zmm2 and a
        ; {%k1} {z} register-index vpermpd.
   3838 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask0:
   3839 ; GENERIC:       # %bb.0:
   3840 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [7:0.50]
   3841 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   3842 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   3843 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3844 ;
   3845 ; SKX-LABEL: test_masked_z_8xdouble_perm_mask0:
   3846 ; SKX:       # %bb.0:
   3847 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
   3848 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   3849 ; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   3850 ; SKX-NEXT:    retq # sched: [7:1.00]
   3851   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
   3852   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   3853   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   3854   ret <8 x double> %res
   3855 }
   3856 define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
        ; Merge-masked permute of %vec with indices <3,0,0,2,7,4,4,6>; the upper
        ; four indices are the lower four plus 4. Lanes whose %mask element equals
        ; zero take the shuffled value, the rest keep %vec2. The checks expect no
        ; index-vector load: vpermpd into zmm1 under %k1, then a copy to zmm0.
   3857 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask1:
   3858 ; GENERIC:       # %bb.0:
   3859 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   3860 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
   3861 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   3862 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3863 ;
   3864 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask1:
   3865 ; SKX:       # %bb.0:
   3866 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   3867 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
   3868 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   3869 ; SKX-NEXT:    retq # sched: [7:1.00]
   3870   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
   3871   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   3872   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   3873   ret <8 x double> %res
   3874 }
   3875 
   3876 define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x i64> %mask) {
        ; Zero-masked permute of %vec with indices <3,0,0,2,7,4,4,6>; the upper
        ; four indices are the lower four plus 4. Lanes whose %mask element equals
        ; zero take the shuffled value, all other lanes become zero. The checks
        ; expect a {%k1} {z} vpermpd with no index-vector load.
   3877 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
   3878 ; GENERIC:       # %bb.0:
   3879 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   3880 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
   3881 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3882 ;
   3883 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
   3884 ; SKX:       # %bb.0:
   3885 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   3886 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
   3887 ; SKX-NEXT:    retq # sched: [7:1.00]
   3888   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
   3889   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   3890   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   3891   ret <8 x double> %res
   3892 }
   3893 define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
        ; Merge-masked permute of %vec with indices <7,5,5,5,3,5,1,7>: lanes whose
        ; %mask element equals zero take the shuffled value, the remaining lanes
        ; keep %vec2. The checks expect a constant index vector in zmm3, a vpermpd
        ; into zmm1 under %k1, then a vmovapd copy of zmm1 into zmm0.
   3894 ; GENERIC-LABEL: test_masked_8xdouble_perm_mask2:
   3895 ; GENERIC:       # %bb.0:
   3896 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [7:0.50]
   3897 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   3898 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   3899 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   3900 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3901 ;
   3902 ; SKX-LABEL: test_masked_8xdouble_perm_mask2:
   3903 ; SKX:       # %bb.0:
   3904 ; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
   3905 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   3906 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   3907 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   3908 ; SKX-NEXT:    retq # sched: [7:1.00]
   3909   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
   3910   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   3911   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   3912   ret <8 x double> %res
   3913 }
   3914 
   3915 define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i64> %mask) {
        ; Zero-masked permute of %vec with indices <7,5,5,5,3,5,1,7>: lanes whose
        ; %mask element equals zero take the shuffled value, all other lanes become
        ; zero. The checks expect a constant index vector in zmm2 and a
        ; {%k1} {z} register-index vpermpd.
   3916 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask2:
   3917 ; GENERIC:       # %bb.0:
   3918 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [7:0.50]
   3919 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   3920 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   3921 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3922 ;
   3923 ; SKX-LABEL: test_masked_z_8xdouble_perm_mask2:
   3924 ; SKX:       # %bb.0:
   3925 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
   3926 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   3927 ; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   3928 ; SKX-NEXT:    retq # sched: [7:1.00]
   3929   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
   3930   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   3931   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   3932   ret <8 x double> %res
   3933 }
   3934 define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) {
        ; Unmasked permute of %vec with indices <1,3,3,0,5,7,7,4>; the upper four
        ; indices are the lower four plus 4. The checks expect a single vpermpd
        ; with no index-vector load.
   3935 ; GENERIC-LABEL: test_8xdouble_perm_imm_mask3:
   3936 ; GENERIC:       # %bb.0:
   3937 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
   3938 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3939 ;
   3940 ; SKX-LABEL: test_8xdouble_perm_imm_mask3:
   3941 ; SKX:       # %bb.0:
   3942 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
   3943 ; SKX-NEXT:    retq # sched: [7:1.00]
   3944   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
   3945   ret <8 x double> %res
   3946 }
   3947 define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
        ; Merge-masked permute of %vec with indices <1,3,3,0,5,7,7,4>; the upper
        ; four indices are the lower four plus 4. Lanes whose %mask element equals
        ; zero take the shuffled value, the rest keep %vec2. The checks expect no
        ; index-vector load: vpermpd into zmm1 under %k1, then a copy to zmm0.
   3948 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask3:
   3949 ; GENERIC:       # %bb.0:
   3950 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   3951 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
   3952 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   3953 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3954 ;
   3955 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask3:
   3956 ; SKX:       # %bb.0:
   3957 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   3958 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
   3959 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   3960 ; SKX-NEXT:    retq # sched: [7:1.00]
   3961   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
   3962   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   3963   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   3964   ret <8 x double> %res
   3965 }
   3966 
   3967 define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x i64> %mask) {
        ; Zero-masked permute of %vec with indices <1,3,3,0,5,7,7,4>; the upper
        ; four indices are the lower four plus 4. Lanes whose %mask element equals
        ; zero take the shuffled value, all other lanes become zero. The checks
        ; expect a {%k1} {z} vpermpd with no index-vector load.
   3968 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
   3969 ; GENERIC:       # %bb.0:
   3970 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   3971 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
   3972 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3973 ;
   3974 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
   3975 ; SKX:       # %bb.0:
   3976 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   3977 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
   3978 ; SKX-NEXT:    retq # sched: [7:1.00]
   3979   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
   3980   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   3981   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   3982   ret <8 x double> %res
   3983 }
   3984 define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
        ; Merge-masked permute of %vec with indices <3,5,3,4,6,5,7,1>: lanes whose
        ; %mask element equals zero take the shuffled value, the remaining lanes
        ; keep %vec2. The checks expect a constant index vector in zmm3, a vpermpd
        ; into zmm1 under %k1, then a vmovapd copy of zmm1 into zmm0.
   3985 ; GENERIC-LABEL: test_masked_8xdouble_perm_mask4:
   3986 ; GENERIC:       # %bb.0:
   3987 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [7:0.50]
   3988 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   3989 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   3990 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   3991 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3992 ;
   3993 ; SKX-LABEL: test_masked_8xdouble_perm_mask4:
   3994 ; SKX:       # %bb.0:
   3995 ; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
   3996 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   3997 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   3998 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   3999 ; SKX-NEXT:    retq # sched: [7:1.00]
   4000   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
   4001   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   4002   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   4003   ret <8 x double> %res
   4004 }
   4005 
   4006 define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i64> %mask) {
        ; Zero-masked permute of %vec with indices <3,5,3,4,6,5,7,1>: lanes whose
        ; %mask element equals zero take the shuffled value, all other lanes become
        ; zero. The checks expect a constant index vector in zmm2 and a
        ; {%k1} {z} register-index vpermpd.
   4007 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask4:
   4008 ; GENERIC:       # %bb.0:
   4009 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [7:0.50]
   4010 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   4011 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   4012 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4013 ;
   4014 ; SKX-LABEL: test_masked_z_8xdouble_perm_mask4:
   4015 ; SKX:       # %bb.0:
   4016 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
   4017 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   4018 ; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   4019 ; SKX-NEXT:    retq # sched: [7:1.00]
   4020   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
   4021   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   4022   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   4023   ret <8 x double> %res
   4024 }
   4025 define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
        ; Merge-masked permute of %vec with indices <3,3,2,3,7,7,6,7>; the upper
        ; four indices are the lower four plus 4. Lanes whose %mask element equals
        ; zero take the shuffled value, the rest keep %vec2. The checks expect no
        ; index-vector load: vpermpd into zmm1 under %k1, then a copy to zmm0.
   4026 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask5:
   4027 ; GENERIC:       # %bb.0:
   4028 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   4029 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
   4030 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   4031 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4032 ;
   4033 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask5:
   4034 ; SKX:       # %bb.0:
   4035 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   4036 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
   4037 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   4038 ; SKX-NEXT:    retq # sched: [7:1.00]
   4039   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
   4040   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   4041   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   4042   ret <8 x double> %res
   4043 }
   4044 
   4045 define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x i64> %mask) {
        ; Zero-masked permute of %vec with indices <3,3,2,3,7,7,6,7>; the upper
        ; four indices are the lower four plus 4. Lanes whose %mask element equals
        ; zero take the shuffled value, all other lanes become zero. The checks
        ; expect a {%k1} {z} vpermpd with no index-vector load.
   4046 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
   4047 ; GENERIC:       # %bb.0:
   4048 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   4049 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
   4050 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4051 ;
   4052 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
   4053 ; SKX:       # %bb.0:
   4054 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   4055 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
   4056 ; SKX-NEXT:    retq # sched: [7:1.00]
   4057   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
   4058   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   4059   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   4060   ret <8 x double> %res
   4061 }
   4062 define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
        ; Unmasked permute of %vec with indices <2,7,6,4,0,0,0,2>. The checks
        ; expect the indices to be loaded as a constant vector into zmm1 and
        ; passed to the register-index form of vpermpd.
   4063 ; GENERIC-LABEL: test_8xdouble_perm_mask6:
   4064 ; GENERIC:       # %bb.0:
   4065 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [7:0.50]
   4066 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
   4067 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4068 ;
   4069 ; SKX-LABEL: test_8xdouble_perm_mask6:
   4070 ; SKX:       # %bb.0:
   4071 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
   4072 ; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
   4073 ; SKX-NEXT:    retq # sched: [7:1.00]
   4074   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
   4075   ret <8 x double> %res
   4076 }
   4077 define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
        ; Merge-masked permute of %vec with indices <2,7,6,4,0,0,0,2>: lanes whose
        ; %mask element equals zero take the shuffled value, the remaining lanes
        ; keep %vec2. The checks expect a constant index vector in zmm3, a vpermpd
        ; into zmm1 under %k1, then a vmovapd copy of zmm1 into zmm0.
   4078 ; GENERIC-LABEL: test_masked_8xdouble_perm_mask6:
   4079 ; GENERIC:       # %bb.0:
   4080 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [7:0.50]
   4081 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   4082 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
   4083 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   4084 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4085 ;
   4086 ; SKX-LABEL: test_masked_8xdouble_perm_mask6:
   4087 ; SKX:       # %bb.0:
   4088 ; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
   4089 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   4090 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
   4091 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   4092 ; SKX-NEXT:    retq # sched: [7:1.00]
   4093   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
   4094   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   4095   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   4096   ret <8 x double> %res
   4097 }
   4098 
   4099 define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i64> %mask) {
        ; Zero-masked permute of %vec with indices <2,7,6,4,0,0,0,2>: lanes whose
        ; %mask element equals zero take the shuffled value, all other lanes become
        ; zero. The checks expect a constant index vector in zmm2 and a
        ; {%k1} {z} register-index vpermpd.
   4100 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask6:
   4101 ; GENERIC:       # %bb.0:
   4102 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [7:0.50]
   4103 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   4104 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
   4105 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4106 ;
   4107 ; SKX-LABEL: test_masked_z_8xdouble_perm_mask6:
   4108 ; SKX:       # %bb.0:
   4109 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
   4110 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   4111 ; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
   4112 ; SKX-NEXT:    retq # sched: [7:1.00]
   4113   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
   4114   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   4115   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   4116   ret <8 x double> %res
   4117 }
   4118 define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
        ; Merge-masked permute of %vec with indices <3,1,3,2,7,5,7,6>; the upper
        ; four indices are the lower four plus 4. Lanes whose %mask element equals
        ; zero take the shuffled value, the rest keep %vec2. The checks expect no
        ; index-vector load: vpermpd into zmm1 under %k1, then a copy to zmm0.
   4119 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask7:
   4120 ; GENERIC:       # %bb.0:
   4121 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   4122 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
   4123 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   4124 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4125 ;
   4126 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask7:
   4127 ; SKX:       # %bb.0:
   4128 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   4129 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
   4130 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   4131 ; SKX-NEXT:    retq # sched: [7:1.00]
   4132   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
   4133   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   4134   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   4135   ret <8 x double> %res
   4136 }
   4137 
   4138 define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x i64> %mask) {
        ; Zero-masked permute of %vec with indices <3,1,3,2,7,5,7,6>; the upper
        ; four indices are the lower four plus 4. Lanes whose %mask element equals
        ; zero take the shuffled value, all other lanes become zero. The checks
        ; expect a {%k1} {z} vpermpd with no index-vector load.
   4139 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
   4140 ; GENERIC:       # %bb.0:
   4141 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   4142 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
   4143 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4144 ;
   4145 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
   4146 ; SKX:       # %bb.0:
   4147 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   4148 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
   4149 ; SKX-NEXT:    retq # sched: [7:1.00]
   4150   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
   4151   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   4152   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   4153   ret <8 x double> %res
   4154 }
   4155 define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
        ; Unmasked permute, indices <0,3,4,0,4,2,0,1>, of an <8 x double> loaded
        ; from %vp. The checks expect a constant index vector in zmm0 and a
        ; vpermpd whose data operand is read directly from (%rdi).
   4156 ; GENERIC-LABEL: test_8xdouble_perm_mem_mask0:
   4157 ; GENERIC:       # %bb.0:
   4158 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [7:0.50]
   4159 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
   4160 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4161 ;
   4162 ; SKX-LABEL: test_8xdouble_perm_mem_mask0:
   4163 ; SKX:       # %bb.0:
   4164 ; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
   4165 ; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
   4166 ; SKX-NEXT:    retq # sched: [7:1.00]
   4167   %vec = load <8 x double>, <8 x double>* %vp
   4168   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
   4169   ret <8 x double> %res
   4170 }
   4171 define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
        ; Merge-masked permute, indices <0,3,4,0,4,2,0,1>, of a vector loaded from
        ; %vp: lanes whose %mask element equals zero take the shuffled value, the
        ; remaining lanes keep %vec2. The checks expect a constant index vector in
        ; zmm2 and a vpermpd reading (%rdi) that writes zmm0 under %k1.
   4172 ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask0:
   4173 ; GENERIC:       # %bb.0:
   4174 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [7:0.50]
   4175 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   4176 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   4177 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4178 ;
   4179 ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask0:
   4180 ; SKX:       # %bb.0:
   4181 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
   4182 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   4183 ; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   4184 ; SKX-NEXT:    retq # sched: [7:1.00]
   4185   %vec = load <8 x double>, <8 x double>* %vp
   4186   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
   4187   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   4188   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   4189   ret <8 x double> %res
   4190 }
   4191 
   4192 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x i64> %mask) { ; Zero-masked variant: lanes whose %mask element is zero take the permuted load, other lanes are zeroed.
   4193 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
   4194 ; GENERIC:       # %bb.0:
   4195 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [7:0.50]
   4196 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   4197 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   4198 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4199 ;
   4200 ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
   4201 ; SKX:       # %bb.0:
   4202 ; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
   4203 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   4204 ; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   4205 ; SKX-NEXT:    retq # sched: [7:1.00]
   4206   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4207   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1> ; single-source permute (second operand is undef); indices cross between the lower and upper four lanes
   4208   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4209   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; unselected lanes become zero ({z} in the expected asm)
   4210   ret <8 x double> %res
   4211 }
   4212 
   4213 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; Merge-masked permute of a loaded <8 x double>; the expected asm is vpermpd with a mem[...] source and no index-vector load.
   4214 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
   4215 ; GENERIC:       # %bb.0:
   4216 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   4217 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [8:1.00]
   4218 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4219 ;
   4220 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
   4221 ; SKX:       # %bb.0:
   4222 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   4223 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
   4224 ; SKX-NEXT:    retq # sched: [7:1.00]
   4225   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4226   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7> ; upper four indices are the lower four plus 4, i.e. the same pattern within each group of four lanes
   4227   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4228   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 ; unselected lanes pass %vec2 through
   4229   ret <8 x double> %res
   4230 }
   4231 
   4232 define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x i64> %mask) { ; Zero-masked permute of a loaded <8 x double>; the expected asm is vpermpd with a mem[...] source and no index-vector load.
   4233 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
   4234 ; GENERIC:       # %bb.0:
   4235 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   4236 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [8:1.00]
   4237 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4238 ;
   4239 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
   4240 ; SKX:       # %bb.0:
   4241 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   4242 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
   4243 ; SKX-NEXT:    retq # sched: [7:1.00]
   4244   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4245   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7> ; upper four indices are the lower four plus 4, i.e. the same pattern within each group of four lanes
   4246   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4247   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; unselected lanes become zero ({z} in the expected asm)
   4248   ret <8 x double> %res
   4249 }
   4250 
   4251 define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; Merge-masked variant: lanes whose %mask element is zero take the permuted load, other lanes keep %vec2.
   4252 ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask2:
   4253 ; GENERIC:       # %bb.0:
   4254 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [7:0.50]
   4255 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   4256 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   4257 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4258 ;
   4259 ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask2:
   4260 ; SKX:       # %bb.0:
   4261 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
   4262 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   4263 ; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   4264 ; SKX-NEXT:    retq # sched: [7:1.00]
   4265   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4266   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5> ; single-source permute (second operand is undef); indices cross between the lower and upper four lanes
   4267   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4268   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 ; unselected lanes pass %vec2 through
   4269   ret <8 x double> %res
   4270 }
   4271 
   4272 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x i64> %mask) { ; Zero-masked variant: lanes whose %mask element is zero take the permuted load, other lanes are zeroed.
   4273 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
   4274 ; GENERIC:       # %bb.0:
   4275 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [7:0.50]
   4276 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   4277 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   4278 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4279 ;
   4280 ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
   4281 ; SKX:       # %bb.0:
   4282 ; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
   4283 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   4284 ; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   4285 ; SKX-NEXT:    retq # sched: [7:1.00]
   4286   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4287   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5> ; single-source permute (second operand is undef); indices cross between the lower and upper four lanes
   4288   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4289   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; unselected lanes become zero ({z} in the expected asm)
   4290   ret <8 x double> %res
   4291 }
   4292 
   4293 define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) { ; Unmasked permute of a loaded <8 x double>; the expected asm is a single vpermpd with a mem[...] source and no index-vector load.
   4294 ; GENERIC-LABEL: test_8xdouble_perm_imm_mem_mask3:
   4295 ; GENERIC:       # %bb.0:
   4296 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
   4297 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4298 ;
   4299 ; SKX-LABEL: test_8xdouble_perm_imm_mem_mask3:
   4300 ; SKX:       # %bb.0:
   4301 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
   4302 ; SKX-NEXT:    retq # sched: [7:1.00]
   4303   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4304   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4> ; upper four indices are the lower four plus 4, i.e. the same pattern within each group of four lanes
   4305   ret <8 x double> %res
   4306 }
   4307 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; Merge-masked permute of a loaded <8 x double>; the expected asm is vpermpd with a mem[...] source and no index-vector load.
   4308 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
   4309 ; GENERIC:       # %bb.0:
   4310 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   4311 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
   4312 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4313 ;
   4314 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
   4315 ; SKX:       # %bb.0:
   4316 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   4317 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
   4318 ; SKX-NEXT:    retq # sched: [7:1.00]
   4319   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4320   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4> ; upper four indices are the lower four plus 4, i.e. the same pattern within each group of four lanes
   4321   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4322   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 ; unselected lanes pass %vec2 through
   4323   ret <8 x double> %res
   4324 }
   4325 
   4326 define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x i64> %mask) { ; Zero-masked permute of a loaded <8 x double>; the expected asm is vpermpd with a mem[...] source and no index-vector load.
   4327 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
   4328 ; GENERIC:       # %bb.0:
   4329 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   4330 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
   4331 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4332 ;
   4333 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
   4334 ; SKX:       # %bb.0:
   4335 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   4336 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
   4337 ; SKX-NEXT:    retq # sched: [7:1.00]
   4338   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4339   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4> ; upper four indices are the lower four plus 4, i.e. the same pattern within each group of four lanes
   4340   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4341   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; unselected lanes become zero ({z} in the expected asm)
   4342   ret <8 x double> %res
   4343 }
   4344 
   4345 define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; Merge-masked variant: lanes whose %mask element is zero take the permuted load, other lanes keep %vec2.
   4346 ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask4:
   4347 ; GENERIC:       # %bb.0:
   4348 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [7:0.50]
   4349 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   4350 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   4351 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4352 ;
   4353 ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask4:
   4354 ; SKX:       # %bb.0:
   4355 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
   4356 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   4357 ; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   4358 ; SKX-NEXT:    retq # sched: [7:1.00]
   4359   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4360   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0> ; single-source permute (second operand is undef); indices cross between the lower and upper four lanes
   4361   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4362   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 ; unselected lanes pass %vec2 through
   4363   ret <8 x double> %res
   4364 }
   4365 
   4366 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x i64> %mask) { ; Zero-masked variant: lanes whose %mask element is zero take the permuted load, other lanes are zeroed.
   4367 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
   4368 ; GENERIC:       # %bb.0:
   4369 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [7:0.50]
   4370 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   4371 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   4372 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4373 ;
   4374 ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
   4375 ; SKX:       # %bb.0:
   4376 ; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
   4377 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   4378 ; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   4379 ; SKX-NEXT:    retq # sched: [7:1.00]
   4380   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4381   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0> ; single-source permute (second operand is undef); indices cross between the lower and upper four lanes
   4382   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4383   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; unselected lanes become zero ({z} in the expected asm)
   4384   ret <8 x double> %res
   4385 }
   4386 
   4387 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; Merge-masked permute of a loaded <8 x double>; the expected asm is vpermpd with a mem[...] source and no index-vector load.
   4388 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
   4389 ; GENERIC:       # %bb.0:
   4390 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   4391 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [8:1.00]
   4392 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4393 ;
   4394 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
   4395 ; SKX:       # %bb.0:
   4396 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   4397 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
   4398 ; SKX-NEXT:    retq # sched: [7:1.00]
   4399   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4400   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7> ; upper four indices are the lower four plus 4, i.e. the same pattern within each group of four lanes
   4401   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4402   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 ; unselected lanes pass %vec2 through
   4403   ret <8 x double> %res
   4404 }
   4405 
   4406 define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x i64> %mask) { ; Zero-masked permute of a loaded <8 x double>; the expected asm is vpermpd with a mem[...] source and no index-vector load.
   4407 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
   4408 ; GENERIC:       # %bb.0:
   4409 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   4410 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [8:1.00]
   4411 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4412 ;
   4413 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
   4414 ; SKX:       # %bb.0:
   4415 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   4416 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
   4417 ; SKX-NEXT:    retq # sched: [7:1.00]
   4418   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4419   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7> ; upper four indices are the lower four plus 4, i.e. the same pattern within each group of four lanes
   4420   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4421   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; unselected lanes become zero ({z} in the expected asm)
   4422   ret <8 x double> %res
   4423 }
   4424 
   4425 define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) { ; Unmasked single-source permute of an <8 x double> loaded from %vp; the expected asm is vpermpd with the load folded as (%rdi).
   4426 ; GENERIC-LABEL: test_8xdouble_perm_mem_mask6:
   4427 ; GENERIC:       # %bb.0:
   4428 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [7:0.50]
   4429 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
   4430 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4431 ;
   4432 ; SKX-LABEL: test_8xdouble_perm_mem_mask6:
   4433 ; SKX:       # %bb.0:
   4434 ; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
   4435 ; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
   4436 ; SKX-NEXT:    retq # sched: [7:1.00]
   4437   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4438   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5> ; second operand is undef, so every index selects from %vec; indices cross between the lower and upper four lanes
   4439   ret <8 x double> %res
   4440 }
   4441 define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; Merge-masked variant: lanes whose %mask element is zero take the permuted load, other lanes keep %vec2.
   4442 ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask6:
   4443 ; GENERIC:       # %bb.0:
   4444 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [7:0.50]
   4445 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   4446 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
   4447 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4448 ;
   4449 ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask6:
   4450 ; SKX:       # %bb.0:
   4451 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
   4452 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   4453 ; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
   4454 ; SKX-NEXT:    retq # sched: [7:1.00]
   4455   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4456   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5> ; single-source permute (second operand is undef); indices cross between the lower and upper four lanes
   4457   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4458   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 ; unselected lanes pass %vec2 through
   4459   ret <8 x double> %res
   4460 }
   4461 
   4462 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x i64> %mask) { ; Zero-masked variant: lanes whose %mask element is zero take the permuted load, other lanes are zeroed.
   4463 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
   4464 ; GENERIC:       # %bb.0:
   4465 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [7:0.50]
   4466 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   4467 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
   4468 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4469 ;
   4470 ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
   4471 ; SKX:       # %bb.0:
   4472 ; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
   4473 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   4474 ; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
   4475 ; SKX-NEXT:    retq # sched: [7:1.00]
   4476   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4477   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5> ; single-source permute (second operand is undef); indices cross between the lower and upper four lanes
   4478   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4479   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; unselected lanes become zero ({z} in the expected asm)
   4480   ret <8 x double> %res
   4481 }
   4482 
   4483 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; Merge-masked permute of a loaded <8 x double>; the expected asm is vpermpd with a mem[...] source and no index-vector load.
   4484 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
   4485 ; GENERIC:       # %bb.0:
   4486 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   4487 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
   4488 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4489 ;
   4490 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
   4491 ; SKX:       # %bb.0:
   4492 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   4493 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
   4494 ; SKX-NEXT:    retq # sched: [7:1.00]
   4495   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4496   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4> ; upper four indices are the lower four plus 4, i.e. the same pattern within each group of four lanes
   4497   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4498   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 ; unselected lanes pass %vec2 through
   4499   ret <8 x double> %res
   4500 }
   4501 
   4502 define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x i64> %mask) { ; Zero-masked permute of a loaded <8 x double>; the expected asm is vpermpd with a mem[...] source and no index-vector load.
   4503 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
   4504 ; GENERIC:       # %bb.0:
   4505 ; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
   4506 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
   4507 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4508 ;
   4509 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
   4510 ; SKX:       # %bb.0:
   4511 ; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
   4512 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
   4513 ; SKX-NEXT:    retq # sched: [7:1.00]
   4514   %vec = load <8 x double>, <8 x double>* %vp ; source vector comes from memory
   4515   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4> ; upper four indices are the lower four plus 4, i.e. the same pattern within each group of four lanes
   4516   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; a lane is selected when its mask element equals zero (the expected asm builds %k1 with vptestnmq)
   4517   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ; unselected lanes become zero ({z} in the expected asm)
   4518   ret <8 x double> %res
   4519 }
   4520 
   4521 define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) { ; Unmasked single-source byte shuffle of a <16 x i8> register argument; the expected asm is a single vpshufb.
   4522 ; GENERIC-LABEL: test_16xi8_perm_mask0:
   4523 ; GENERIC:       # %bb.0:
   4524 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
   4525 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4526 ;
   4527 ; SKX-LABEL: test_16xi8_perm_mask0:
   4528 ; SKX:       # %bb.0:
   4529 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
   4530 ; SKX-NEXT:    retq # sched: [7:1.00]
   4531   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14> ; second operand is undef, so every index selects a byte of %vec
   4532   ret <16 x i8> %res
   4533 }
   4534 define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { ; Merge-masked byte shuffle: lanes whose %mask byte is zero take the shuffled byte, other lanes keep %vec2.
   4535 ; GENERIC-LABEL: test_masked_16xi8_perm_mask0:
   4536 ; GENERIC:       # %bb.0:
   4537 ; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
   4538 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
   4539 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   4540 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4541 ;
   4542 ; SKX-LABEL: test_masked_16xi8_perm_mask0:
   4543 ; SKX:       # %bb.0:
   4544 ; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
   4545 ; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
   4546 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   4547 ; SKX-NEXT:    retq # sched: [7:1.00]
   4548   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14> ; single-source byte shuffle (second operand is undef)
   4549   %cmp = icmp eq <16 x i8> %mask, zeroinitializer ; a lane is selected when its mask byte equals zero (the expected asm builds %k1 with vptestnmb)
   4550   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2 ; unselected lanes pass %vec2 through
   4551   ret <16 x i8> %res
   4552 }
   4553 
   4554 define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) { ; Zero-masked byte shuffle: lanes whose %mask byte is zero take the shuffled byte, other lanes are zeroed.
   4555 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask0:
   4556 ; GENERIC:       # %bb.0:
   4557 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
   4558 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
   4559 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4560 ;
   4561 ; SKX-LABEL: test_masked_z_16xi8_perm_mask0:
   4562 ; SKX:       # %bb.0:
   4563 ; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
   4564 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
   4565 ; SKX-NEXT:    retq # sched: [7:1.00]
   4566   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14> ; single-source byte shuffle (second operand is undef)
   4567   %cmp = icmp eq <16 x i8> %mask, zeroinitializer ; a lane is selected when its mask byte equals zero (the expected asm builds %k1 with vptestnmb)
   4568   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer ; unselected lanes become zero ({z} in the expected asm)
   4569   ret <16 x i8> %res
   4570 }
   4571 define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { ; Merge-masked byte shuffle: lanes whose %mask byte is zero take the shuffled byte, other lanes keep %vec2.
   4572 ; GENERIC-LABEL: test_masked_16xi8_perm_mask1:
   4573 ; GENERIC:       # %bb.0:
   4574 ; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
   4575 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
   4576 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   4577 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4578 ;
   4579 ; SKX-LABEL: test_masked_16xi8_perm_mask1:
   4580 ; SKX:       # %bb.0:
   4581 ; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
   4582 ; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
   4583 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   4584 ; SKX-NEXT:    retq # sched: [7:1.00]
   4585   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0> ; single-source byte shuffle (second operand is undef)
   4586   %cmp = icmp eq <16 x i8> %mask, zeroinitializer ; a lane is selected when its mask byte equals zero (the expected asm builds %k1 with vptestnmb)
   4587   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2 ; unselected lanes pass %vec2 through
   4588   ret <16 x i8> %res
   4589 }
   4590 
   4591 define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) { ; Zero-masked byte shuffle: lanes whose %mask byte is zero take the shuffled byte, other lanes are zeroed.
   4592 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask1:
   4593 ; GENERIC:       # %bb.0:
   4594 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
   4595 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
   4596 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4597 ;
   4598 ; SKX-LABEL: test_masked_z_16xi8_perm_mask1:
   4599 ; SKX:       # %bb.0:
   4600 ; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
   4601 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
   4602 ; SKX-NEXT:    retq # sched: [7:1.00]
   4603   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0> ; single-source byte shuffle (second operand is undef)
   4604   %cmp = icmp eq <16 x i8> %mask, zeroinitializer ; a lane is selected when its mask byte equals zero (the expected asm builds %k1 with vptestnmb)
   4605   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer ; unselected lanes become zero ({z} in the expected asm)
   4606   ret <16 x i8> %res
   4607 }
   4608 define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { ; Merge-masked byte shuffle: lanes whose %mask byte is zero take the shuffled byte, other lanes keep %vec2.
   4609 ; GENERIC-LABEL: test_masked_16xi8_perm_mask2:
   4610 ; GENERIC:       # %bb.0:
   4611 ; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
   4612 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
   4613 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   4614 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4615 ;
   4616 ; SKX-LABEL: test_masked_16xi8_perm_mask2:
   4617 ; SKX:       # %bb.0:
   4618 ; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
   4619 ; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
   4620 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   4621 ; SKX-NEXT:    retq # sched: [7:1.00]
   4622   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7> ; single-source byte shuffle (second operand is undef)
   4623   %cmp = icmp eq <16 x i8> %mask, zeroinitializer ; a lane is selected when its mask byte equals zero (the expected asm builds %k1 with vptestnmb)
   4624   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2 ; unselected lanes pass %vec2 through
   4625   ret <16 x i8> %res
   4626 }
   4627 
   4628 define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) { ; Zero-masked byte shuffle: lanes whose %mask byte is zero take the shuffled byte, other lanes are zeroed.
   4629 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask2:
   4630 ; GENERIC:       # %bb.0:
   4631 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
   4632 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
   4633 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4634 ;
   4635 ; SKX-LABEL: test_masked_z_16xi8_perm_mask2:
   4636 ; SKX:       # %bb.0:
   4637 ; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
   4638 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
   4639 ; SKX-NEXT:    retq # sched: [7:1.00]
   4640   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7> ; single-source byte shuffle (second operand is undef)
   4641   %cmp = icmp eq <16 x i8> %mask, zeroinitializer ; a lane is selected when its mask byte equals zero (the expected asm builds %k1 with vptestnmb)
   4642   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer ; unselected lanes become zero ({z} in the expected asm)
   4643   ret <16 x i8> %res
   4644 }
   4645 define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) { ; Unmasked single-source byte shuffle of a <16 x i8> register argument; the expected asm is a single vpshufb.
   4646 ; GENERIC-LABEL: test_16xi8_perm_mask3:
   4647 ; GENERIC:       # %bb.0:
   4648 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
   4649 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4650 ;
   4651 ; SKX-LABEL: test_16xi8_perm_mask3:
   4652 ; SKX:       # %bb.0:
   4653 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
   4654 ; SKX-NEXT:    retq # sched: [7:1.00]
   4655   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6> ; second operand is undef, so every index selects a byte of %vec
   4656   ret <16 x i8> %res
   4657 }
   4658 define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { ; Merge-masked byte shuffle: lanes whose %mask byte is zero take the shuffled byte, other lanes keep %vec2.
   4659 ; GENERIC-LABEL: test_masked_16xi8_perm_mask3:
   4660 ; GENERIC:       # %bb.0:
   4661 ; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
   4662 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
   4663 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   4664 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4665 ;
   4666 ; SKX-LABEL: test_masked_16xi8_perm_mask3:
   4667 ; SKX:       # %bb.0:
   4668 ; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
   4669 ; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
   4670 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   4671 ; SKX-NEXT:    retq # sched: [7:1.00]
   4672   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6> ; single-source byte shuffle (second operand is undef)
   4673   %cmp = icmp eq <16 x i8> %mask, zeroinitializer ; a lane is selected when its mask byte equals zero (the expected asm builds %k1 with vptestnmb)
   4674   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2 ; unselected lanes pass %vec2 through
   4675   ret <16 x i8> %res
   4676 }
   4677 
   4678 define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) { ; Zero-masked byte shuffle: lanes whose %mask byte is zero take the shuffled byte, other lanes are zeroed.
   4679 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask3:
   4680 ; GENERIC:       # %bb.0:
   4681 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
   4682 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
   4683 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4684 ;
   4685 ; SKX-LABEL: test_masked_z_16xi8_perm_mask3:
   4686 ; SKX:       # %bb.0:
   4687 ; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
   4688 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
   4689 ; SKX-NEXT:    retq # sched: [7:1.00]
   4690   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6> ; single-source byte shuffle (second operand is undef)
   4691   %cmp = icmp eq <16 x i8> %mask, zeroinitializer ; a lane is selected when its mask byte equals zero (the expected asm builds %k1 with vptestnmb)
   4692   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer ; unselected lanes become zero ({z} in the expected asm)
   4693   ret <16 x i8> %res
   4694 }
   4695 define <16 x i8> @test_16xi8_perm_mem_mask0(<16 x i8>* %vp) { ; Unmasked byte shuffle of a <16 x i8> loaded from %vp; the expected asm loads with vmovdqa and then applies vpshufb to the register.
   4696 ; GENERIC-LABEL: test_16xi8_perm_mem_mask0:
   4697 ; GENERIC:       # %bb.0:
   4698 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
   4699 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
   4700 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4701 ;
   4702 ; SKX-LABEL: test_16xi8_perm_mem_mask0:
   4703 ; SKX:       # %bb.0:
   4704 ; SKX-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
   4705 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
   4706 ; SKX-NEXT:    retq # sched: [7:1.00]
   4707   %vec = load <16 x i8>, <16 x i8>* %vp ; source vector comes from memory
   4708   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13> ; second operand is undef, so every index selects a byte of %vec
   4709   ret <16 x i8> %res
   4710 }
   4711 define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
        ; Merge-masked byte permute of a 128-bit vector loaded from %vp: lanes whose
        ; %mask byte is zero receive the shuffled byte, every other lane keeps %vec2.
        ; Both check prefixes expect a vmovdqa load into xmm2, vptestnmb to build
        ; %k1, and a {%k1}-masked vpshufb writing xmm0.
   4712 ; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask0:
   4713 ; GENERIC:       # %bb.0:
   4714 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
   4715 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
   4716 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
   4717 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4718 ;
   4719 ; SKX-LABEL: test_masked_16xi8_perm_mem_mask0:
   4720 ; SKX:       # %bb.0:
   4721 ; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
   4722 ; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
   4723 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
   4724 ; SKX-NEXT:    retq # sched: [7:1.00]
   4725   %vec = load <16 x i8>, <16 x i8>* %vp
   4726   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
          ; %cmp is true where the %mask byte equals zero; false lanes keep %vec2.
   4727   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   4728   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
   4729   ret <16 x i8> %res
   4730 }
   4731 
   4732 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %mask) {
        ; Zero-masked byte permute of a 128-bit vector loaded from %vp: lanes whose
        ; %mask byte is zero receive the shuffled byte, every other lane is zeroed.
        ; Both check prefixes expect a vmovdqa load into xmm1, vptestnmb to build
        ; %k1, and a {%k1} {z}-masked vpshufb writing xmm0.
   4733 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask0:
   4734 ; GENERIC:       # %bb.0:
   4735 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
   4736 ; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
   4737 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
   4738 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4739 ;
   4740 ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask0:
   4741 ; SKX:       # %bb.0:
   4742 ; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
   4743 ; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
   4744 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
   4745 ; SKX-NEXT:    retq # sched: [7:1.00]
   4746   %vec = load <16 x i8>, <16 x i8>* %vp
   4747   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
          ; %cmp is true where the %mask byte equals zero; false lanes become zero.
   4748   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   4749   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
   4750   ret <16 x i8> %res
   4751 }
   4752 
   4753 define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
        ; Merge-masked byte permute of a 128-bit vector loaded from %vp, using a
        ; second constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane keeps %vec2.
        ; Both check prefixes expect vmovdqa, vptestnmb, then a {%k1}-masked vpshufb.
   4754 ; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask1:
   4755 ; GENERIC:       # %bb.0:
   4756 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
   4757 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
   4758 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
   4759 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4760 ;
   4761 ; SKX-LABEL: test_masked_16xi8_perm_mem_mask1:
   4762 ; SKX:       # %bb.0:
   4763 ; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
   4764 ; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
   4765 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
   4766 ; SKX-NEXT:    retq # sched: [7:1.00]
   4767   %vec = load <16 x i8>, <16 x i8>* %vp
   4768   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
          ; %cmp is true where the %mask byte equals zero; false lanes keep %vec2.
   4769   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   4770   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
   4771   ret <16 x i8> %res
   4772 }
   4773 
   4774 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %mask) {
        ; Zero-masked byte permute of a 128-bit vector loaded from %vp, using a
        ; second constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane is zeroed.
        ; Both check prefixes expect vmovdqa, vptestnmb, then a {z}-masked vpshufb.
   4775 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask1:
   4776 ; GENERIC:       # %bb.0:
   4777 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
   4778 ; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
   4779 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
   4780 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4781 ;
   4782 ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask1:
   4783 ; SKX:       # %bb.0:
   4784 ; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
   4785 ; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
   4786 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
   4787 ; SKX-NEXT:    retq # sched: [7:1.00]
   4788   %vec = load <16 x i8>, <16 x i8>* %vp
   4789   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
          ; %cmp is true where the %mask byte equals zero; false lanes become zero.
   4790   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   4791   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
   4792   ret <16 x i8> %res
   4793 }
   4794 
   4795 define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
        ; Merge-masked byte permute of a 128-bit vector loaded from %vp, using a
        ; third constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane keeps %vec2.
        ; Both check prefixes expect vmovdqa, vptestnmb, then a {%k1}-masked vpshufb.
   4796 ; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask2:
   4797 ; GENERIC:       # %bb.0:
   4798 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
   4799 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
   4800 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
   4801 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4802 ;
   4803 ; SKX-LABEL: test_masked_16xi8_perm_mem_mask2:
   4804 ; SKX:       # %bb.0:
   4805 ; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
   4806 ; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
   4807 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
   4808 ; SKX-NEXT:    retq # sched: [7:1.00]
   4809   %vec = load <16 x i8>, <16 x i8>* %vp
   4810   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
          ; %cmp is true where the %mask byte equals zero; false lanes keep %vec2.
   4811   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   4812   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
   4813   ret <16 x i8> %res
   4814 }
   4815 
   4816 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %mask) {
        ; Zero-masked byte permute of a 128-bit vector loaded from %vp, using a
        ; third constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane is zeroed.
        ; Both check prefixes expect vmovdqa, vptestnmb, then a {z}-masked vpshufb.
   4817 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask2:
   4818 ; GENERIC:       # %bb.0:
   4819 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
   4820 ; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
   4821 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
   4822 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4823 ;
   4824 ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask2:
   4825 ; SKX:       # %bb.0:
   4826 ; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
   4827 ; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
   4828 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
   4829 ; SKX-NEXT:    retq # sched: [7:1.00]
   4830   %vec = load <16 x i8>, <16 x i8>* %vp
   4831   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
          ; %cmp is true where the %mask byte equals zero; false lanes become zero.
   4832   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   4833   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
   4834   ret <16 x i8> %res
   4835 }
   4836 
   4837 define <16 x i8> @test_16xi8_perm_mem_mask3(<16 x i8>* %vp) {
        ; Unmasked byte permute of a 128-bit vector loaded from %vp, using a fourth
        ; constant index pattern.
        ; Both check prefixes expect a vmovdqa load followed by a plain vpshufb;
        ; only the sched annotations differ between them.
   4838 ; GENERIC-LABEL: test_16xi8_perm_mem_mask3:
   4839 ; GENERIC:       # %bb.0:
   4840 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
   4841 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
   4842 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4843 ;
   4844 ; SKX-LABEL: test_16xi8_perm_mem_mask3:
   4845 ; SKX:       # %bb.0:
   4846 ; SKX-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
   4847 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
   4848 ; SKX-NEXT:    retq # sched: [7:1.00]
   4849   %vec = load <16 x i8>, <16 x i8>* %vp
          ; Single-source shuffle: the second operand is undef and all indices are < 16.
   4850   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   4851   ret <16 x i8> %res
   4852 }
   4853 define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
        ; Merge-masked byte permute of a 128-bit vector loaded from %vp, using a
        ; fourth constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane keeps %vec2.
        ; Both check prefixes expect vmovdqa, vptestnmb, then a {%k1}-masked vpshufb.
   4854 ; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask3:
   4855 ; GENERIC:       # %bb.0:
   4856 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
   4857 ; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
   4858 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
   4859 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4860 ;
   4861 ; SKX-LABEL: test_masked_16xi8_perm_mem_mask3:
   4862 ; SKX:       # %bb.0:
   4863 ; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
   4864 ; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
   4865 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
   4866 ; SKX-NEXT:    retq # sched: [7:1.00]
   4867   %vec = load <16 x i8>, <16 x i8>* %vp
   4868   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
          ; %cmp is true where the %mask byte equals zero; false lanes keep %vec2.
   4869   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   4870   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
   4871   ret <16 x i8> %res
   4872 }
   4873 
   4874 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %mask) {
        ; Zero-masked byte permute of a 128-bit vector loaded from %vp, using a
        ; fourth constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane is zeroed.
        ; Both check prefixes expect vmovdqa, vptestnmb, then a {z}-masked vpshufb.
   4875 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask3:
   4876 ; GENERIC:       # %bb.0:
   4877 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
   4878 ; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
   4879 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
   4880 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4881 ;
   4882 ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask3:
   4883 ; SKX:       # %bb.0:
   4884 ; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
   4885 ; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
   4886 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
   4887 ; SKX-NEXT:    retq # sched: [7:1.00]
   4888   %vec = load <16 x i8>, <16 x i8>* %vp
   4889   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
          ; %cmp is true where the %mask byte equals zero; false lanes become zero.
   4890   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   4891   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
   4892   ret <16 x i8> %res
   4893 }
   4894 
   4895 define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
        ; Unmasked byte permute of a 256-bit register operand.
        ; The first 16 indices are all < 16 and the last 16 are all in 16..31, so no
        ; byte moves between the two 128-bit halves.
        ; Both check prefixes expect a single vpshufb on ymm0.
   4896 ; GENERIC-LABEL: test_32xi8_perm_mask0:
   4897 ; GENERIC:       # %bb.0:
   4898 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
   4899 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4900 ;
   4901 ; SKX-LABEL: test_32xi8_perm_mask0:
   4902 ; SKX:       # %bb.0:
   4903 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
   4904 ; SKX-NEXT:    retq # sched: [7:1.00]
   4905   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
   4906   ret <32 x i8> %res
   4907 }
   4908 define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
        ; Merge-masked byte permute of a 256-bit register operand: lanes whose %mask
        ; byte is zero receive the shuffled byte, every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmb to build %k1, a {%k1}-masked vpshufb
        ; writing ymm1, and a vmovdqa copying ymm1 into ymm0 before the return.
   4909 ; GENERIC-LABEL: test_masked_32xi8_perm_mask0:
   4910 ; GENERIC:       # %bb.0:
   4911 ; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
   4912 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
   4913 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   4914 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4915 ;
   4916 ; SKX-LABEL: test_masked_32xi8_perm_mask0:
   4917 ; SKX:       # %bb.0:
   4918 ; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
   4919 ; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
   4920 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   4921 ; SKX-NEXT:    retq # sched: [7:1.00]
   4922   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
          ; %cmp is true where the %mask byte equals zero; false lanes keep %vec2.
   4923   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   4924   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
   4925   ret <32 x i8> %res
   4926 }
   4927 
   4928 define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) {
        ; Zero-masked byte permute of a 256-bit register operand: lanes whose %mask
        ; byte is zero receive the shuffled byte, every other lane is zeroed.
        ; Both check prefixes expect vptestnmb to build %k1 and then a {z}-masked
        ; vpshufb on ymm0.
   4929 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask0:
   4930 ; GENERIC:       # %bb.0:
   4931 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
   4932 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
   4933 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4934 ;
   4935 ; SKX-LABEL: test_masked_z_32xi8_perm_mask0:
   4936 ; SKX:       # %bb.0:
   4937 ; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
   4938 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
   4939 ; SKX-NEXT:    retq # sched: [7:1.00]
   4940   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
          ; %cmp is true where the %mask byte equals zero; false lanes become zero.
   4941   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   4942   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
   4943   ret <32 x i8> %res
   4944 }
   4945 define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
        ; Merge-masked byte permute of a 256-bit register operand, using a second
        ; constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmb, a {%k1}-masked vpshufb writing ymm1,
        ; and a vmovdqa copying ymm1 into ymm0.
   4946 ; GENERIC-LABEL: test_masked_32xi8_perm_mask1:
   4947 ; GENERIC:       # %bb.0:
   4948 ; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
   4949 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:0.50]
   4950 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   4951 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4952 ;
   4953 ; SKX-LABEL: test_masked_32xi8_perm_mask1:
   4954 ; SKX:       # %bb.0:
   4955 ; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
   4956 ; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
   4957 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   4958 ; SKX-NEXT:    retq # sched: [7:1.00]
   4959   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
          ; %cmp is true where the %mask byte equals zero; false lanes keep %vec2.
   4960   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   4961   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
   4962   ret <32 x i8> %res
   4963 }
   4964 
   4965 define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) {
        ; Zero-masked byte permute of a 256-bit register operand, using a second
        ; constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane is zeroed.
        ; Both check prefixes expect vptestnmb and then a {z}-masked vpshufb on ymm0.
   4966 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask1:
   4967 ; GENERIC:       # %bb.0:
   4968 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
   4969 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:0.50]
   4970 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4971 ;
   4972 ; SKX-LABEL: test_masked_z_32xi8_perm_mask1:
   4973 ; SKX:       # %bb.0:
   4974 ; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
   4975 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
   4976 ; SKX-NEXT:    retq # sched: [7:1.00]
   4977   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
          ; %cmp is true where the %mask byte equals zero; false lanes become zero.
   4978   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   4979   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
   4980   ret <32 x i8> %res
   4981 }
   4982 define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
        ; Merge-masked byte permute of a 256-bit register operand, using a third
        ; constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmb, a {%k1}-masked vpshufb writing ymm1,
        ; and a vmovdqa copying ymm1 into ymm0.
   4983 ; GENERIC-LABEL: test_masked_32xi8_perm_mask2:
   4984 ; GENERIC:       # %bb.0:
   4985 ; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
   4986 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:0.50]
   4987 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   4988 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4989 ;
   4990 ; SKX-LABEL: test_masked_32xi8_perm_mask2:
   4991 ; SKX:       # %bb.0:
   4992 ; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
   4993 ; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
   4994 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   4995 ; SKX-NEXT:    retq # sched: [7:1.00]
   4996   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
          ; %cmp is true where the %mask byte equals zero; false lanes keep %vec2.
   4997   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   4998   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
   4999   ret <32 x i8> %res
   5000 }
   5001 
   5002 define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) {
        ; Zero-masked byte permute of a 256-bit register operand, using a third
        ; constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane is zeroed.
        ; Both check prefixes expect vptestnmb and then a {z}-masked vpshufb on ymm0.
   5003 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask2:
   5004 ; GENERIC:       # %bb.0:
   5005 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
   5006 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:0.50]
   5007 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5008 ;
   5009 ; SKX-LABEL: test_masked_z_32xi8_perm_mask2:
   5010 ; SKX:       # %bb.0:
   5011 ; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
   5012 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
   5013 ; SKX-NEXT:    retq # sched: [7:1.00]
   5014   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
          ; %cmp is true where the %mask byte equals zero; false lanes become zero.
   5015   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   5016   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
   5017   ret <32 x i8> %res
   5018 }
   5019 define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
        ; Unmasked byte permute of a 256-bit register operand, using a fourth
        ; constant index pattern.
        ; The first 16 indices are all < 16 and the last 16 are all in 16..31, so no
        ; byte moves between the two 128-bit halves.
        ; Both check prefixes expect a single vpshufb on ymm0.
   5020 ; GENERIC-LABEL: test_32xi8_perm_mask3:
   5021 ; GENERIC:       # %bb.0:
   5022 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
   5023 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5024 ;
   5025 ; SKX-LABEL: test_32xi8_perm_mask3:
   5026 ; SKX:       # %bb.0:
   5027 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
   5028 ; SKX-NEXT:    retq # sched: [7:1.00]
   5029   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
   5030   ret <32 x i8> %res
   5031 }
   5032 define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
        ; Merge-masked byte permute of a 256-bit register operand, using a fourth
        ; constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmb, a {%k1}-masked vpshufb writing ymm1,
        ; and a vmovdqa copying ymm1 into ymm0.
   5033 ; GENERIC-LABEL: test_masked_32xi8_perm_mask3:
   5034 ; GENERIC:       # %bb.0:
   5035 ; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
   5036 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
   5037 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   5038 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5039 ;
   5040 ; SKX-LABEL: test_masked_32xi8_perm_mask3:
   5041 ; SKX:       # %bb.0:
   5042 ; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
   5043 ; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
   5044 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   5045 ; SKX-NEXT:    retq # sched: [7:1.00]
   5046   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
          ; %cmp is true where the %mask byte equals zero; false lanes keep %vec2.
   5047   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   5048   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
   5049   ret <32 x i8> %res
   5050 }
   5051 
   5052 define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) {
        ; Zero-masked byte permute of a 256-bit register operand, using a fourth
        ; constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane is zeroed.
        ; Both check prefixes expect vptestnmb and then a {z}-masked vpshufb on ymm0.
   5053 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask3:
   5054 ; GENERIC:       # %bb.0:
   5055 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
   5056 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
   5057 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5058 ;
   5059 ; SKX-LABEL: test_masked_z_32xi8_perm_mask3:
   5060 ; SKX:       # %bb.0:
   5061 ; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
   5062 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
   5063 ; SKX-NEXT:    retq # sched: [7:1.00]
   5064   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
          ; %cmp is true where the %mask byte equals zero; false lanes become zero.
   5065   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   5066   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
   5067   ret <32 x i8> %res
   5068 }
   5069 define <32 x i8> @test_32xi8_perm_mem_mask0(<32 x i8>* %vp) {
        ; Unmasked byte permute of a 256-bit vector loaded from %vp.
        ; The first 16 indices are all < 16 and the last 16 are all in 16..31, so no
        ; byte moves between the two 128-bit halves.
        ; Both check prefixes expect a vmovdqa load followed by a plain vpshufb.
   5070 ; GENERIC-LABEL: test_32xi8_perm_mem_mask0:
   5071 ; GENERIC:       # %bb.0:
   5072 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
   5073 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
   5074 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5075 ;
   5076 ; SKX-LABEL: test_32xi8_perm_mem_mask0:
   5077 ; SKX:       # %bb.0:
   5078 ; SKX-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
   5079 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
   5080 ; SKX-NEXT:    retq # sched: [7:1.00]
   5081   %vec = load <32 x i8>, <32 x i8>* %vp
   5082   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
   5083   ret <32 x i8> %res
   5084 }
   5085 define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
        ; Merge-masked byte permute of a 256-bit vector loaded from %vp: lanes whose
        ; %mask byte is zero receive the shuffled byte, every other lane keeps %vec2.
        ; Both check prefixes expect a vmovdqa load into ymm2, vptestnmb to build
        ; %k1, and a {%k1}-masked vpshufb writing ymm0.
   5086 ; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask0:
   5087 ; GENERIC:       # %bb.0:
   5088 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
   5089 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
   5090 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
   5091 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5092 ;
   5093 ; SKX-LABEL: test_masked_32xi8_perm_mem_mask0:
   5094 ; SKX:       # %bb.0:
   5095 ; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
   5096 ; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
   5097 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
   5098 ; SKX-NEXT:    retq # sched: [7:1.00]
   5099   %vec = load <32 x i8>, <32 x i8>* %vp
   5100   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
          ; %cmp is true where the %mask byte equals zero; false lanes keep %vec2.
   5101   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   5102   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
   5103   ret <32 x i8> %res
   5104 }
   5105 
   5106 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %mask) {
        ; Zero-masked byte permute of a 256-bit vector loaded from %vp: lanes whose
        ; %mask byte is zero receive the shuffled byte, every other lane is zeroed.
        ; Both check prefixes expect a vmovdqa load into ymm1, vptestnmb to build
        ; %k1, and a {%k1} {z}-masked vpshufb writing ymm0.
   5107 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask0:
   5108 ; GENERIC:       # %bb.0:
   5109 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
   5110 ; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
   5111 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
   5112 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5113 ;
   5114 ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask0:
   5115 ; SKX:       # %bb.0:
   5116 ; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
   5117 ; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
   5118 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
   5119 ; SKX-NEXT:    retq # sched: [7:1.00]
   5120   %vec = load <32 x i8>, <32 x i8>* %vp
   5121   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
          ; %cmp is true where the %mask byte equals zero; false lanes become zero.
   5122   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   5123   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
   5124   ret <32 x i8> %res
   5125 }
   5126 
   5127 define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
        ; Merge-masked byte permute of a 256-bit vector loaded from %vp, using a
        ; second constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane keeps %vec2.
        ; Both check prefixes expect vmovdqa, vptestnmb, then a {%k1}-masked vpshufb.
   5128 ; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask1:
   5129 ; GENERIC:       # %bb.0:
   5130 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
   5131 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
   5132 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:0.50]
   5133 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5134 ;
   5135 ; SKX-LABEL: test_masked_32xi8_perm_mem_mask1:
   5136 ; SKX:       # %bb.0:
   5137 ; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
   5138 ; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
   5139 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
   5140 ; SKX-NEXT:    retq # sched: [7:1.00]
   5141   %vec = load <32 x i8>, <32 x i8>* %vp
   5142   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
          ; %cmp is true where the %mask byte equals zero; false lanes keep %vec2.
   5143   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   5144   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
   5145   ret <32 x i8> %res
   5146 }
   5147 
   5148 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %mask) {
        ; Zero-masked byte permute of a 256-bit vector loaded from %vp, using a
        ; second constant index pattern. Lanes whose %mask byte is zero receive the
        ; shuffled byte, every other lane is zeroed.
        ; Both check prefixes expect vmovdqa, vptestnmb, then a {z}-masked vpshufb.
   5149 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask1:
   5150 ; GENERIC:       # %bb.0:
   5151 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
   5152 ; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
   5153 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:0.50]
   5154 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5155 ;
   5156 ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask1:
   5157 ; SKX:       # %bb.0:
   5158 ; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
   5159 ; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
   5160 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
   5161 ; SKX-NEXT:    retq # sched: [7:1.00]
   5162   %vec = load <32 x i8>, <32 x i8>* %vp
   5163   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
          ; %cmp is true where the %mask byte equals zero; false lanes become zero.
   5164   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   5165   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
   5166   ret <32 x i8> %res
   5167 }
   5168 
   5169 define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
        ; Merge-masked, memory-operand variant. Loads a 32-byte vector from %vp,
        ; applies a constant byte shuffle, and keeps the %vec2 byte in every lane
        ; whose %mask byte is non-zero. The autogenerated checks below expect
        ; vptestnmb to build %k1 and a vpshufb that is merge-masked by it.
   5170 ; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask2:
   5171 ; GENERIC:       # %bb.0:
   5172 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
   5173 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
   5174 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:0.50]
   5175 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5176 ;
   5177 ; SKX-LABEL: test_masked_32xi8_perm_mem_mask2:
   5178 ; SKX:       # %bb.0:
   5179 ; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
   5180 ; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
   5181 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
   5182 ; SKX-NEXT:    retq # sched: [7:1.00]
   5183   %vec = load <32 x i8>, <32 x i8>* %vp
   5184   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
   5185   %cmp = icmp eq <32 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5186   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2 ; shuffled byte where %cmp is true, else the %vec2 byte
   5187   ret <32 x i8> %res
   5188 }
   5189 
   5190 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %mask) {
        ; Zero-masked, memory-operand variant. Loads a 32-byte vector from %vp,
        ; applies a constant byte shuffle, and zeroes every lane whose %mask byte
        ; is non-zero. The autogenerated checks below expect vptestnmb to build
        ; %k1 and a vpshufb that is zero-masked by it.
   5191 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask2:
   5192 ; GENERIC:       # %bb.0:
   5193 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
   5194 ; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
   5195 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:0.50]
   5196 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5197 ;
   5198 ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask2:
   5199 ; SKX:       # %bb.0:
   5200 ; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
   5201 ; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
   5202 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
   5203 ; SKX-NEXT:    retq # sched: [7:1.00]
   5204   %vec = load <32 x i8>, <32 x i8>* %vp
   5205   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
   5206   %cmp = icmp eq <32 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5207   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer ; shuffled byte where %cmp is true, else 0
   5208   ret <32 x i8> %res
   5209 }
   5210 
   5211 define <32 x i8> @test_32xi8_perm_mem_mask3(<32 x i8>* %vp) {
        ; Unmasked baseline with a memory operand. Loads a 32-byte vector from %vp
        ; and applies a constant byte shuffle. The autogenerated checks below
        ; expect a plain vector load followed by an unmasked vpshufb.
   5212 ; GENERIC-LABEL: test_32xi8_perm_mem_mask3:
   5213 ; GENERIC:       # %bb.0:
   5214 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
   5215 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
   5216 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5217 ;
   5218 ; SKX-LABEL: test_32xi8_perm_mem_mask3:
   5219 ; SKX:       # %bb.0:
   5220 ; SKX-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
   5221 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
   5222 ; SKX-NEXT:    retq # sched: [7:1.00]
   5223   %vec = load <32 x i8>, <32 x i8>* %vp
   5224   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
   5225   ret <32 x i8> %res
   5226 }
   5227 define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
        ; Merge-masked, memory-operand variant. Loads a 32-byte vector from %vp,
        ; applies a constant byte shuffle, and keeps the %vec2 byte in every lane
        ; whose %mask byte is non-zero. The autogenerated checks below expect
        ; vptestnmb to build %k1 and a vpshufb that is merge-masked by it.
   5228 ; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask3:
   5229 ; GENERIC:       # %bb.0:
   5230 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
   5231 ; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
   5232 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
   5233 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5234 ;
   5235 ; SKX-LABEL: test_masked_32xi8_perm_mem_mask3:
   5236 ; SKX:       # %bb.0:
   5237 ; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
   5238 ; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
   5239 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
   5240 ; SKX-NEXT:    retq # sched: [7:1.00]
   5241   %vec = load <32 x i8>, <32 x i8>* %vp
   5242   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
   5243   %cmp = icmp eq <32 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5244   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2 ; shuffled byte where %cmp is true, else the %vec2 byte
   5245   ret <32 x i8> %res
   5246 }
   5247 
   5248 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %mask) {
        ; Zero-masked, memory-operand variant. Loads a 32-byte vector from %vp,
        ; applies a constant byte shuffle, and zeroes every lane whose %mask byte
        ; is non-zero. The autogenerated checks below expect vptestnmb to build
        ; %k1 and a vpshufb that is zero-masked by it.
   5249 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask3:
   5250 ; GENERIC:       # %bb.0:
   5251 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
   5252 ; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
   5253 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
   5254 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5255 ;
   5256 ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask3:
   5257 ; SKX:       # %bb.0:
   5258 ; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
   5259 ; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
   5260 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
   5261 ; SKX-NEXT:    retq # sched: [7:1.00]
   5262   %vec = load <32 x i8>, <32 x i8>* %vp
   5263   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
   5264   %cmp = icmp eq <32 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5265   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer ; shuffled byte where %cmp is true, else 0
   5266   ret <32 x i8> %res
   5267 }
   5268 
   5269 define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) {
        ; Unmasked baseline with a register operand. Applies a constant byte
        ; shuffle to the 64-byte vector %vec. The autogenerated checks below
        ; expect one unmasked vpshufb on zmm0 followed by the return.
   5270 ; GENERIC-LABEL: test_64xi8_perm_mask0:
   5271 ; GENERIC:       # %bb.0:
   5272 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
   5273 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5274 ;
   5275 ; SKX-LABEL: test_64xi8_perm_mask0:
   5276 ; SKX:       # %bb.0:
   5277 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
   5278 ; SKX-NEXT:    retq # sched: [7:1.00]
   5279   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
   5280   ret <64 x i8> %res
   5281 }
   5282 define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
        ; Merge-masked, register-operand variant. Applies a constant byte shuffle
        ; to %vec and keeps the %vec2 byte in every lane whose %mask byte is
        ; non-zero. The autogenerated checks below expect the masked vpshufb to
        ; write zmm1, followed by a copy of zmm1 into zmm0.
   5283 ; GENERIC-LABEL: test_masked_64xi8_perm_mask0:
   5284 ; GENERIC:       # %bb.0:
   5285 ; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
   5286 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
   5287 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   5288 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5289 ;
   5290 ; SKX-LABEL: test_masked_64xi8_perm_mask0:
   5291 ; SKX:       # %bb.0:
   5292 ; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
   5293 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
   5294 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   5295 ; SKX-NEXT:    retq # sched: [7:1.00]
   5296   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
   5297   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5298   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, else the %vec2 byte
   5299   ret <64 x i8> %res
   5300 }
   5301 
   5302 define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) {
        ; Zero-masked, register-operand variant. Applies a constant byte shuffle
        ; to %vec and zeroes every lane whose %mask byte is non-zero. The
        ; autogenerated checks below expect vptestnmb to build %k1 and a vpshufb
        ; on zmm0 that is zero-masked by it.
   5303 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask0:
   5304 ; GENERIC:       # %bb.0:
   5305 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
   5306 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
   5307 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5308 ;
   5309 ; SKX-LABEL: test_masked_z_64xi8_perm_mask0:
   5310 ; SKX:       # %bb.0:
   5311 ; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
   5312 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
   5313 ; SKX-NEXT:    retq # sched: [7:1.00]
   5314   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
   5315   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5316   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ; shuffled byte where %cmp is true, else 0
   5317   ret <64 x i8> %res
   5318 }
   5319 define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
        ; Merge-masked, register-operand variant. Applies a constant byte shuffle
        ; to %vec and keeps the %vec2 byte in every lane whose %mask byte is
        ; non-zero. The autogenerated checks below expect the masked vpshufb to
        ; write zmm1, followed by a copy of zmm1 into zmm0.
   5320 ; GENERIC-LABEL: test_masked_64xi8_perm_mask1:
   5321 ; GENERIC:       # %bb.0:
   5322 ; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
   5323 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:0.50]
   5324 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   5325 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5326 ;
   5327 ; SKX-LABEL: test_masked_64xi8_perm_mask1:
   5328 ; SKX:       # %bb.0:
   5329 ; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
   5330 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
   5331 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   5332 ; SKX-NEXT:    retq # sched: [7:1.00]
   5333   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
   5334   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5335   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, else the %vec2 byte
   5336   ret <64 x i8> %res
   5337 }
   5338 
   5339 define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) {
        ; Zero-masked, register-operand variant. Applies a constant byte shuffle
        ; to %vec and zeroes every lane whose %mask byte is non-zero. The
        ; autogenerated checks below expect vptestnmb to build %k1 and a vpshufb
        ; on zmm0 that is zero-masked by it.
   5340 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask1:
   5341 ; GENERIC:       # %bb.0:
   5342 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
   5343 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:0.50]
   5344 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5345 ;
   5346 ; SKX-LABEL: test_masked_z_64xi8_perm_mask1:
   5347 ; SKX:       # %bb.0:
   5348 ; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
   5349 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
   5350 ; SKX-NEXT:    retq # sched: [7:1.00]
   5351   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
   5352   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5353   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ; shuffled byte where %cmp is true, else 0
   5354   ret <64 x i8> %res
   5355 }
   5356 define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
        ; Merge-masked, register-operand variant. Applies a constant byte shuffle
        ; to %vec and keeps the %vec2 byte in every lane whose %mask byte is
        ; non-zero. The autogenerated checks below expect the masked vpshufb to
        ; write zmm1, followed by a copy of zmm1 into zmm0.
   5357 ; GENERIC-LABEL: test_masked_64xi8_perm_mask2:
   5358 ; GENERIC:       # %bb.0:
   5359 ; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
   5360 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:0.50]
   5361 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   5362 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5363 ;
   5364 ; SKX-LABEL: test_masked_64xi8_perm_mask2:
   5365 ; SKX:       # %bb.0:
   5366 ; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
   5367 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
   5368 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   5369 ; SKX-NEXT:    retq # sched: [7:1.00]
   5370   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
   5371   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5372   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, else the %vec2 byte
   5373   ret <64 x i8> %res
   5374 }
   5375 
   5376 define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) {
        ; Zero-masked, register-operand variant. Applies a constant byte shuffle
        ; to %vec and zeroes every lane whose %mask byte is non-zero. The
        ; autogenerated checks below expect vptestnmb to build %k1 and a vpshufb
        ; on zmm0 that is zero-masked by it.
   5377 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask2:
   5378 ; GENERIC:       # %bb.0:
   5379 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
   5380 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:0.50]
   5381 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5382 ;
   5383 ; SKX-LABEL: test_masked_z_64xi8_perm_mask2:
   5384 ; SKX:       # %bb.0:
   5385 ; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
   5386 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
   5387 ; SKX-NEXT:    retq # sched: [7:1.00]
   5388   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
   5389   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5390   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ; shuffled byte where %cmp is true, else 0
   5391   ret <64 x i8> %res
   5392 }
   5393 define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) {
        ; Unmasked baseline with a register operand. Applies a constant byte
        ; shuffle to the 64-byte vector %vec. The autogenerated checks below
        ; expect one unmasked vpshufb on zmm0 followed by the return.
   5394 ; GENERIC-LABEL: test_64xi8_perm_mask3:
   5395 ; GENERIC:       # %bb.0:
   5396 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
   5397 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5398 ;
   5399 ; SKX-LABEL: test_64xi8_perm_mask3:
   5400 ; SKX:       # %bb.0:
   5401 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
   5402 ; SKX-NEXT:    retq # sched: [7:1.00]
   5403   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
   5404   ret <64 x i8> %res
   5405 }
   5406 define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
        ; Merge-masked, register-operand variant. Applies a constant byte shuffle
        ; to %vec and keeps the %vec2 byte in every lane whose %mask byte is
        ; non-zero. The autogenerated checks below expect the masked vpshufb to
        ; write zmm1, followed by a copy of zmm1 into zmm0.
   5407 ; GENERIC-LABEL: test_masked_64xi8_perm_mask3:
   5408 ; GENERIC:       # %bb.0:
   5409 ; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
   5410 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
   5411 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   5412 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5413 ;
   5414 ; SKX-LABEL: test_masked_64xi8_perm_mask3:
   5415 ; SKX:       # %bb.0:
   5416 ; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
   5417 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
   5418 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   5419 ; SKX-NEXT:    retq # sched: [7:1.00]
   5420   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
   5421   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5422   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, else the %vec2 byte
   5423   ret <64 x i8> %res
   5424 }
   5425 
   5426 define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) {
        ; Zero-masked, register-operand variant. Applies a constant byte shuffle
        ; to %vec and zeroes every lane whose %mask byte is non-zero. The
        ; autogenerated checks below expect vptestnmb to build %k1 and a vpshufb
        ; on zmm0 that is zero-masked by it.
   5427 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask3:
   5428 ; GENERIC:       # %bb.0:
   5429 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
   5430 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
   5431 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5432 ;
   5433 ; SKX-LABEL: test_masked_z_64xi8_perm_mask3:
   5434 ; SKX:       # %bb.0:
   5435 ; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
   5436 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
   5437 ; SKX-NEXT:    retq # sched: [7:1.00]
   5438   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
   5439   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5440   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ; shuffled byte where %cmp is true, else 0
   5441   ret <64 x i8> %res
   5442 }
   5443 define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) {
        ; Unmasked baseline with a memory operand. Loads a 64-byte vector from %vp
        ; and applies a constant byte shuffle. The autogenerated checks below
        ; expect a plain vector load followed by an unmasked vpshufb.
   5444 ; GENERIC-LABEL: test_64xi8_perm_mem_mask0:
   5445 ; GENERIC:       # %bb.0:
   5446 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [7:0.50]
   5447 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
   5448 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5449 ;
   5450 ; SKX-LABEL: test_64xi8_perm_mem_mask0:
   5451 ; SKX:       # %bb.0:
   5452 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
   5453 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
   5454 ; SKX-NEXT:    retq # sched: [7:1.00]
   5455   %vec = load <64 x i8>, <64 x i8>* %vp
   5456   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
   5457   ret <64 x i8> %res
   5458 }
   5459 define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
        ; Merge-masked, memory-operand variant. Loads a 64-byte vector from %vp,
        ; applies a constant byte shuffle, and keeps the %vec2 byte in every lane
        ; whose %mask byte is non-zero. The autogenerated checks below expect
        ; vptestnmb to build %k1 and a vpshufb that is merge-masked by it.
   5460 ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask0:
   5461 ; GENERIC:       # %bb.0:
   5462 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
   5463 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
   5464 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
   5465 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5466 ;
   5467 ; SKX-LABEL: test_masked_64xi8_perm_mem_mask0:
   5468 ; SKX:       # %bb.0:
   5469 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
   5470 ; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
   5471 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
   5472 ; SKX-NEXT:    retq # sched: [7:1.00]
   5473   %vec = load <64 x i8>, <64 x i8>* %vp
   5474   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
   5475   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5476   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, else the %vec2 byte
   5477   ret <64 x i8> %res
   5478 }
   5479 
   5480 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %mask) {
        ; Zero-masked, memory-operand variant. Loads a 64-byte vector from %vp,
        ; applies a constant byte shuffle, and zeroes every lane whose %mask byte
        ; is non-zero. The autogenerated checks below expect vptestnmb to build
        ; %k1 and a vpshufb that is zero-masked by it.
   5481 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask0:
   5482 ; GENERIC:       # %bb.0:
   5483 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
   5484 ; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
   5485 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
   5486 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5487 ;
   5488 ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask0:
   5489 ; SKX:       # %bb.0:
   5490 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
   5491 ; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
   5492 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
   5493 ; SKX-NEXT:    retq # sched: [7:1.00]
   5494   %vec = load <64 x i8>, <64 x i8>* %vp
   5495   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
   5496   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5497   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ; shuffled byte where %cmp is true, else 0
   5498   ret <64 x i8> %res
   5499 }
   5500 
   5501 define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
        ; Merge-masked, memory-operand variant. Loads a 64-byte vector from %vp,
        ; applies a constant byte shuffle, and keeps the %vec2 byte in every lane
        ; whose %mask byte is non-zero. The autogenerated checks below expect
        ; vptestnmb to build %k1 and a vpshufb that is merge-masked by it.
   5502 ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask1:
   5503 ; GENERIC:       # %bb.0:
   5504 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
   5505 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
   5506 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50]
   5507 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5508 ;
   5509 ; SKX-LABEL: test_masked_64xi8_perm_mem_mask1:
   5510 ; SKX:       # %bb.0:
   5511 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
   5512 ; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
   5513 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
   5514 ; SKX-NEXT:    retq # sched: [7:1.00]
   5515   %vec = load <64 x i8>, <64 x i8>* %vp
   5516   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
   5517   %cmp = icmp eq <64 x i8> %mask, zeroinitializer ; true in lanes where %mask is 0
   5518   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 ; shuffled byte where %cmp is true, else the %vec2 byte
   5519   ret <64 x i8> %res
   5520 }
   5521 
   5522 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %mask) {
        ; Zero-masked byte shuffle of a vector loaded from %vp: lanes where
        ; %mask == 0 take the shuffled byte, every other lane becomes zero.
        ; Both prefixes expect a load, vptestnmb into %k1, and vpshufb {%k1} {z}.
   5523 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask1:
   5524 ; GENERIC:       # %bb.0:
   5525 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
   5526 ; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
   5527 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50]
   5528 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5529 ;
   5530 ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask1:
   5531 ; SKX:       # %bb.0:
   5532 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
   5533 ; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
   5534 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
   5535 ; SKX-NEXT:    retq # sched: [7:1.00]
   5536   %vec = load <64 x i8>, <64 x i8>* %vp
   5537   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
   5538   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
   5539   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
   5540   ret <64 x i8> %res
   5541 }
   5542 
   5543 define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
        ; Merge-masked byte shuffle of a vector loaded from %vp: lanes where
        ; %mask == 0 take the shuffled byte, every other lane keeps %vec2.
        ; Both prefixes expect a load, vptestnmb into %k1, and vpshufb {%k1}.
   5544 ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask2:
   5545 ; GENERIC:       # %bb.0:
   5546 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
   5547 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
   5548 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50]
   5549 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5550 ;
   5551 ; SKX-LABEL: test_masked_64xi8_perm_mem_mask2:
   5552 ; SKX:       # %bb.0:
   5553 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
   5554 ; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
   5555 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
   5556 ; SKX-NEXT:    retq # sched: [7:1.00]
   5557   %vec = load <64 x i8>, <64 x i8>* %vp
   5558   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
   5559   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
   5560   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
   5561   ret <64 x i8> %res
   5562 }
   5563 
   5564 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %mask) {
        ; Zero-masked byte shuffle of a vector loaded from %vp: lanes where
        ; %mask == 0 take the shuffled byte, every other lane becomes zero.
        ; Both prefixes expect a load, vptestnmb into %k1, and vpshufb {%k1} {z}.
   5565 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask2:
   5566 ; GENERIC:       # %bb.0:
   5567 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
   5568 ; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
   5569 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50]
   5570 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5571 ;
   5572 ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask2:
   5573 ; SKX:       # %bb.0:
   5574 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
   5575 ; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
   5576 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
   5577 ; SKX-NEXT:    retq # sched: [7:1.00]
   5578   %vec = load <64 x i8>, <64 x i8>* %vp
   5579   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
   5580   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
   5581   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
   5582   ret <64 x i8> %res
   5583 }
   5584 
   5585 define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) {
        ; Unmasked byte shuffle of a vector loaded from %vp.
        ; Both prefixes expect a vmovdqa64 load followed by an unmasked vpshufb.
   5586 ; GENERIC-LABEL: test_64xi8_perm_mem_mask3:
   5587 ; GENERIC:       # %bb.0:
   5588 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [7:0.50]
   5589 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
   5590 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5591 ;
   5592 ; SKX-LABEL: test_64xi8_perm_mem_mask3:
   5593 ; SKX:       # %bb.0:
   5594 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
   5595 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
   5596 ; SKX-NEXT:    retq # sched: [7:1.00]
   5597   %vec = load <64 x i8>, <64 x i8>* %vp
   5598   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
   5599   ret <64 x i8> %res
   5600 }
   5601 define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
        ; Merge-masked byte shuffle of a vector loaded from %vp: lanes where
        ; %mask == 0 take the shuffled byte, every other lane keeps %vec2.
        ; Both prefixes expect a load, vptestnmb into %k1, and vpshufb {%k1}.
   5602 ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask3:
   5603 ; GENERIC:       # %bb.0:
   5604 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
   5605 ; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
   5606 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
   5607 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5608 ;
   5609 ; SKX-LABEL: test_masked_64xi8_perm_mem_mask3:
   5610 ; SKX:       # %bb.0:
   5611 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
   5612 ; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
   5613 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
   5614 ; SKX-NEXT:    retq # sched: [7:1.00]
   5615   %vec = load <64 x i8>, <64 x i8>* %vp
   5616   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
   5617   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
   5618   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
   5619   ret <64 x i8> %res
   5620 }
   5621 
   5622 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %mask) {
        ; Zero-masked byte shuffle of a vector loaded from %vp: lanes where
        ; %mask == 0 take the shuffled byte, every other lane becomes zero.
        ; Both prefixes expect a load, vptestnmb into %k1, and vpshufb {%k1} {z}.
   5623 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask3:
   5624 ; GENERIC:       # %bb.0:
   5625 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
   5626 ; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
   5627 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
   5628 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5629 ;
   5630 ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask3:
   5631 ; SKX:       # %bb.0:
   5632 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
   5633 ; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
   5634 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
   5635 ; SKX-NEXT:    retq # sched: [7:1.00]
   5636   %vec = load <64 x i8>, <64 x i8>* %vp
   5637   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
   5638   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
   5639   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
   5640   ret <64 x i8> %res
   5641 }
   5642 
   5643 define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) {
        ; Unmasked shuffle that keeps elements 0-3 in place and permutes
        ; elements 4-7; both prefixes expect a single vpshufhw.
   5644 ; GENERIC-LABEL: test_8xi16_perm_high_mask0:
   5645 ; GENERIC:       # %bb.0:
   5646 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50]
   5647 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5648 ;
   5649 ; SKX-LABEL: test_8xi16_perm_high_mask0:
   5650 ; SKX:       # %bb.0:
   5651 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
   5652 ; SKX-NEXT:    retq # sched: [7:1.00]
   5653   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
   5654   ret <8 x i16> %res
   5655 }
   5656 define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked high-half shuffle: lanes where %mask == 0 take the
        ; shuffled element, every other lane keeps %vec2. Expected code is
        ; vptestnmw into %k1, vpshufhw {%k1} into xmm1, then a copy to xmm0.
   5657 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask0:
   5658 ; GENERIC:       # %bb.0:
   5659 ; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
   5660 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50]
   5661 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5662 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5663 ;
   5664 ; SKX-LABEL: test_masked_8xi16_perm_high_mask0:
   5665 ; SKX:       # %bb.0:
   5666 ; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
   5667 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
   5668 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5669 ; SKX-NEXT:    retq # sched: [7:1.00]
   5670   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
   5671   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5672   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   5673   ret <8 x i16> %res
   5674 }
   5675 
   5676 define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) {
        ; Zero-masked high-half shuffle: lanes where %mask == 0 take the
        ; shuffled element, every other lane becomes zero. Expected code is
        ; vptestnmw into %k1 followed by vpshufhw {%k1} {z}.
   5677 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask0:
   5678 ; GENERIC:       # %bb.0:
   5679 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   5680 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50]
   5681 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5682 ;
   5683 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask0:
   5684 ; SKX:       # %bb.0:
   5685 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   5686 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
   5687 ; SKX-NEXT:    retq # sched: [7:1.00]
   5688   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
   5689   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5690   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   5691   ret <8 x i16> %res
   5692 }
   5693 define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked low-half shuffle (elements 0-3 permuted, 4-7 in place):
        ; lanes where %mask == 0 take the shuffled element, others keep %vec2.
        ; Expected code is vptestnmw into %k1, vpshuflw {%k1}, then a copy to xmm0.
   5694 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask1:
   5695 ; GENERIC:       # %bb.0:
   5696 ; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
   5697 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:0.50]
   5698 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5699 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5700 ;
   5701 ; SKX-LABEL: test_masked_8xi16_perm_low_mask1:
   5702 ; SKX:       # %bb.0:
   5703 ; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
   5704 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
   5705 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5706 ; SKX-NEXT:    retq # sched: [7:1.00]
   5707   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
   5708   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5709   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   5710   ret <8 x i16> %res
   5711 }
   5712 
   5713 define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) {
        ; Zero-masked low-half shuffle (elements 0-3 permuted, 4-7 in place):
        ; lanes where %mask == 0 take the shuffled element, others become zero.
        ; Expected code is vptestnmw into %k1 followed by vpshuflw {%k1} {z}.
   5714 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask1:
   5715 ; GENERIC:       # %bb.0:
   5716 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   5717 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:0.50]
   5718 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5719 ;
   5720 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask1:
   5721 ; SKX:       # %bb.0:
   5722 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   5723 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
   5724 ; SKX-NEXT:    retq # sched: [7:1.00]
   5725   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
   5726   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5727   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   5728   ret <8 x i16> %res
   5729 }
   5730 define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked high-half shuffle (elements 0-3 in place, 4-7 permuted):
        ; lanes where %mask == 0 take the shuffled element, others keep %vec2.
        ; Expected code is vptestnmw into %k1, vpshufhw {%k1}, then a copy to xmm0.
   5731 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask2:
   5732 ; GENERIC:       # %bb.0:
   5733 ; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
   5734 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:0.50]
   5735 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5736 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5737 ;
   5738 ; SKX-LABEL: test_masked_8xi16_perm_high_mask2:
   5739 ; SKX:       # %bb.0:
   5740 ; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
   5741 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
   5742 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5743 ; SKX-NEXT:    retq # sched: [7:1.00]
   5744   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
   5745   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5746   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   5747   ret <8 x i16> %res
   5748 }
   5749 
   5750 define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) {
        ; Zero-masked high-half shuffle (elements 0-3 in place, 4-7 permuted):
        ; lanes where %mask == 0 take the shuffled element, others become zero.
        ; Expected code is vptestnmw into %k1 followed by vpshufhw {%k1} {z}.
   5751 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask2:
   5752 ; GENERIC:       # %bb.0:
   5753 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   5754 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:0.50]
   5755 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5756 ;
   5757 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask2:
   5758 ; SKX:       # %bb.0:
   5759 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   5760 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
   5761 ; SKX-NEXT:    retq # sched: [7:1.00]
   5762   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
   5763   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5764   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   5765   ret <8 x i16> %res
   5766 }
   5767 define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) {
        ; Unmasked shuffle that permutes elements 0-3 and keeps elements 4-7
        ; in place; both prefixes expect a single vpshuflw.
   5768 ; GENERIC-LABEL: test_8xi16_perm_low_mask3:
   5769 ; GENERIC:       # %bb.0:
   5770 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50]
   5771 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5772 ;
   5773 ; SKX-LABEL: test_8xi16_perm_low_mask3:
   5774 ; SKX:       # %bb.0:
   5775 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
   5776 ; SKX-NEXT:    retq # sched: [7:1.00]
   5777   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
   5778   ret <8 x i16> %res
   5779 }
   5780 define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked low-half shuffle (elements 0-3 permuted, 4-7 in place):
        ; lanes where %mask == 0 take the shuffled element, others keep %vec2.
        ; Expected code is vptestnmw into %k1, vpshuflw {%k1}, then a copy to xmm0.
   5781 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask3:
   5782 ; GENERIC:       # %bb.0:
   5783 ; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
   5784 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50]
   5785 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5786 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5787 ;
   5788 ; SKX-LABEL: test_masked_8xi16_perm_low_mask3:
   5789 ; SKX:       # %bb.0:
   5790 ; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
   5791 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
   5792 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5793 ; SKX-NEXT:    retq # sched: [7:1.00]
   5794   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
   5795   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5796   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   5797   ret <8 x i16> %res
   5798 }
   5799 
   5800 define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) {
        ; Zero-masked low-half shuffle (elements 0-3 permuted, 4-7 in place):
        ; lanes where %mask == 0 take the shuffled element, others become zero.
        ; Expected code is vptestnmw into %k1 followed by vpshuflw {%k1} {z}.
   5801 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask3:
   5802 ; GENERIC:       # %bb.0:
   5803 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   5804 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50]
   5805 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5806 ;
   5807 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask3:
   5808 ; SKX:       # %bb.0:
   5809 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   5810 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
   5811 ; SKX-NEXT:    retq # sched: [7:1.00]
   5812   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
   5813   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5814   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   5815   ret <8 x i16> %res
   5816 }
   5817 define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked high-half shuffle (elements 0-3 in place, 4-7 permuted):
        ; lanes where %mask == 0 take the shuffled element, others keep %vec2.
        ; Expected code is vptestnmw into %k1, vpshufhw {%k1}, then a copy to xmm0.
   5818 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask4:
   5819 ; GENERIC:       # %bb.0:
   5820 ; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
   5821 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:0.50]
   5822 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5823 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5824 ;
   5825 ; SKX-LABEL: test_masked_8xi16_perm_high_mask4:
   5826 ; SKX:       # %bb.0:
   5827 ; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
   5828 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
   5829 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5830 ; SKX-NEXT:    retq # sched: [7:1.00]
   5831   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
   5832   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5833   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   5834   ret <8 x i16> %res
   5835 }
   5836 
   5837 define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) {
        ; Zero-masked high-half shuffle (elements 0-3 in place, 4-7 permuted):
        ; lanes where %mask == 0 take the shuffled element, others become zero.
        ; Expected code is vptestnmw into %k1 followed by vpshufhw {%k1} {z}.
   5838 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask4:
   5839 ; GENERIC:       # %bb.0:
   5840 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   5841 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:0.50]
   5842 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5843 ;
   5844 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask4:
   5845 ; SKX:       # %bb.0:
   5846 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   5847 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
   5848 ; SKX-NEXT:    retq # sched: [7:1.00]
   5849   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
   5850   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5851   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   5852   ret <8 x i16> %res
   5853 }
   5854 define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked low-half shuffle (elements 0-3 permuted, 4-7 in place):
        ; lanes where %mask == 0 take the shuffled element, others keep %vec2.
        ; Expected code is vptestnmw into %k1, vpshuflw {%k1}, then a copy to xmm0.
   5855 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask5:
   5856 ; GENERIC:       # %bb.0:
   5857 ; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
   5858 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:0.50]
   5859 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5860 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5861 ;
   5862 ; SKX-LABEL: test_masked_8xi16_perm_low_mask5:
   5863 ; SKX:       # %bb.0:
   5864 ; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
   5865 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
   5866 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5867 ; SKX-NEXT:    retq # sched: [7:1.00]
   5868   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
   5869   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5870   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   5871   ret <8 x i16> %res
   5872 }
   5873 
   5874 define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) {
        ; Zero-masked low-half shuffle (elements 0-3 permuted, 4-7 in place):
        ; lanes where %mask == 0 take the shuffled element, others become zero.
        ; Expected code is vptestnmw into %k1 followed by vpshuflw {%k1} {z}.
   5875 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask5:
   5876 ; GENERIC:       # %bb.0:
   5877 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   5878 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:0.50]
   5879 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5880 ;
   5881 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask5:
   5882 ; SKX:       # %bb.0:
   5883 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   5884 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
   5885 ; SKX-NEXT:    retq # sched: [7:1.00]
   5886   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
   5887   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5888   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   5889   ret <8 x i16> %res
   5890 }
   5891 define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) {
        ; Unmasked shuffle that keeps elements 0-3 in place and permutes
        ; elements 4-7; both prefixes expect a single vpshufhw.
   5892 ; GENERIC-LABEL: test_8xi16_perm_high_mask6:
   5893 ; GENERIC:       # %bb.0:
   5894 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50]
   5895 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5896 ;
   5897 ; SKX-LABEL: test_8xi16_perm_high_mask6:
   5898 ; SKX:       # %bb.0:
   5899 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
   5900 ; SKX-NEXT:    retq # sched: [7:1.00]
   5901   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
   5902   ret <8 x i16> %res
   5903 }
   5904 define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked high-half shuffle (elements 0-3 in place, 4-7 permuted):
        ; lanes where %mask == 0 take the shuffled element, others keep %vec2.
        ; Expected code is vptestnmw into %k1, vpshufhw {%k1}, then a copy to xmm0.
   5905 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask6:
   5906 ; GENERIC:       # %bb.0:
   5907 ; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
   5908 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50]
   5909 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5910 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5911 ;
   5912 ; SKX-LABEL: test_masked_8xi16_perm_high_mask6:
   5913 ; SKX:       # %bb.0:
   5914 ; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
   5915 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
   5916 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5917 ; SKX-NEXT:    retq # sched: [7:1.00]
   5918   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
   5919   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5920   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   5921   ret <8 x i16> %res
   5922 }
   5923 
   5924 define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) {
        ; Zero-masked high-half shuffle (elements 0-3 in place, 4-7 permuted):
        ; lanes where %mask == 0 take the shuffled element, others become zero.
        ; Expected code is vptestnmw into %k1 followed by vpshufhw {%k1} {z}.
   5925 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask6:
   5926 ; GENERIC:       # %bb.0:
   5927 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   5928 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50]
   5929 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5930 ;
   5931 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask6:
   5932 ; SKX:       # %bb.0:
   5933 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   5934 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
   5935 ; SKX-NEXT:    retq # sched: [7:1.00]
   5936   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
   5937   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5938   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   5939   ret <8 x i16> %res
   5940 }
   5941 define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked low-half shuffle (elements 0-3 permuted, 4-7 in place):
        ; lanes where %mask == 0 take the shuffled element, others keep %vec2.
        ; Expected code is vptestnmw into %k1, vpshuflw {%k1}, then a copy to xmm0.
   5942 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask7:
   5943 ; GENERIC:       # %bb.0:
   5944 ; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
   5945 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:0.50]
   5946 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5947 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5948 ;
   5949 ; SKX-LABEL: test_masked_8xi16_perm_low_mask7:
   5950 ; SKX:       # %bb.0:
   5951 ; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
   5952 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
   5953 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   5954 ; SKX-NEXT:    retq # sched: [7:1.00]
   5955   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   5956   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5957   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   5958   ret <8 x i16> %res
   5959 }
   5960 
   5961 define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) {
        ; Zero-masked low-half shuffle (elements 0-3 permuted, 4-7 in place):
        ; lanes where %mask == 0 take the shuffled element, others become zero.
        ; Expected code is vptestnmw into %k1 followed by vpshuflw {%k1} {z}.
   5962 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask7:
   5963 ; GENERIC:       # %bb.0:
   5964 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   5965 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:0.50]
   5966 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5967 ;
   5968 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask7:
   5969 ; SKX:       # %bb.0:
   5970 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   5971 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
   5972 ; SKX-NEXT:    retq # sched: [7:1.00]
   5973   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   5974   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   5975   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   5976   ret <8 x i16> %res
   5977 }
   5978 define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) {
        ; Unmasked high-word permute of a vector loaded from %vp: words 4-7 are
        ; reordered as [7,7,4,6] while words 0-3 stay in place.
        ; Both check prefixes expect a single vpshufhw with a `mem` source, i.e. no
        ; separate load instruction before the shuffle.
   5979 ; GENERIC-LABEL: test_8xi16_perm_high_mem_mask0:
   5980 ; GENERIC:       # %bb.0:
   5981 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:0.50]
   5982 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5983 ;
   5984 ; SKX-LABEL: test_8xi16_perm_high_mem_mask0:
   5985 ; SKX:       # %bb.0:
   5986 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
   5987 ; SKX-NEXT:    retq # sched: [7:1.00]
   5988   %vec = load <8 x i16>, <8 x i16>* %vp
   5989   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
   5990   ret <8 x i16> %res
   5991 }
   5992 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked high-word permute of a vector loaded from %vp: words 4-7 are
        ; reordered as [7,7,4,6] while words 0-3 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1}-masked
        ; vpshufhw that reads `mem` and writes xmm0 directly (no vmovdqa copy).
   5993 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask0:
   5994 ; GENERIC:       # %bb.0:
   5995 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   5996 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:0.50]
   5997 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5998 ;
   5999 ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask0:
   6000 ; SKX:       # %bb.0:
   6001 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   6002 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
   6003 ; SKX-NEXT:    retq # sched: [7:1.00]
   6004   %vec = load <8 x i16>, <8 x i16>* %vp
   6005   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
   6006   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6007   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   6008   ret <8 x i16> %res
   6009 }
   6010 
   6011 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masked high-word permute of a vector loaded from %vp: words 4-7 are
        ; reordered as [7,7,4,6] while words 0-3 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane is set to zero.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1} {z} vpshufhw
        ; with a `mem` source.
   6012 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
   6013 ; GENERIC:       # %bb.0:
   6014 ; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
   6015 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:0.50]
   6016 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6017 ;
   6018 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
   6019 ; SKX:       # %bb.0:
   6020 ; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
   6021 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
   6022 ; SKX-NEXT:    retq # sched: [7:1.00]
   6023   %vec = load <8 x i16>, <8 x i16>* %vp
   6024   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
   6025   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6026   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   6027   ret <8 x i16> %res
   6028 }
   6029 
   6030 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked low-word permute of a vector loaded from %vp: words 0-3 are
        ; reordered as [1,3,3,2] while words 4-7 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1}-masked
        ; vpshuflw that reads `mem` and writes xmm0 directly (no vmovdqa copy).
   6031 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask1:
   6032 ; GENERIC:       # %bb.0:
   6033 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   6034 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:0.50]
   6035 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6036 ;
   6037 ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask1:
   6038 ; SKX:       # %bb.0:
   6039 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   6040 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
   6041 ; SKX-NEXT:    retq # sched: [7:1.00]
   6042   %vec = load <8 x i16>, <8 x i16>* %vp
   6043   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   6044   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6045   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   6046   ret <8 x i16> %res
   6047 }
   6048 
   6049 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masked low-word permute of a vector loaded from %vp: words 0-3 are
        ; reordered as [1,3,3,2] while words 4-7 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane is set to zero.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1} {z} vpshuflw
        ; with a `mem` source.
   6050 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
   6051 ; GENERIC:       # %bb.0:
   6052 ; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
   6053 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:0.50]
   6054 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6055 ;
   6056 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
   6057 ; SKX:       # %bb.0:
   6058 ; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
   6059 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
   6060 ; SKX-NEXT:    retq # sched: [7:1.00]
   6061   %vec = load <8 x i16>, <8 x i16>* %vp
   6062   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   6063   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6064   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   6065   ret <8 x i16> %res
   6066 }
   6067 
   6068 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked high-word permute of a vector loaded from %vp: words 4-7 are
        ; reordered as [6,6,5,7] while words 0-3 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1}-masked
        ; vpshufhw that reads `mem` and writes xmm0 directly (no vmovdqa copy).
   6069 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask2:
   6070 ; GENERIC:       # %bb.0:
   6071 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   6072 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:0.50]
   6073 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6074 ;
   6075 ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask2:
   6076 ; SKX:       # %bb.0:
   6077 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   6078 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
   6079 ; SKX-NEXT:    retq # sched: [7:1.00]
   6080   %vec = load <8 x i16>, <8 x i16>* %vp
   6081   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
   6082   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6083   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   6084   ret <8 x i16> %res
   6085 }
   6086 
   6087 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masked high-word permute of a vector loaded from %vp: words 4-7 are
        ; reordered as [6,6,5,7] while words 0-3 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane is set to zero.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1} {z} vpshufhw
        ; with a `mem` source.
   6088 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
   6089 ; GENERIC:       # %bb.0:
   6090 ; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
   6091 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:0.50]
   6092 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6093 ;
   6094 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
   6095 ; SKX:       # %bb.0:
   6096 ; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
   6097 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
   6098 ; SKX-NEXT:    retq # sched: [7:1.00]
   6099   %vec = load <8 x i16>, <8 x i16>* %vp
   6100   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
   6101   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6102   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   6103   ret <8 x i16> %res
   6104 }
   6105 
   6106 define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) {
        ; Unmasked low-word permute of a vector loaded from %vp: words 0-3 are
        ; reordered as [3,1,2,0] while words 4-7 stay in place.
        ; Both check prefixes expect a single vpshuflw with a `mem` source, i.e. no
        ; separate load instruction before the shuffle.
   6107 ; GENERIC-LABEL: test_8xi16_perm_low_mem_mask3:
   6108 ; GENERIC:       # %bb.0:
   6109 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:0.50]
   6110 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6111 ;
   6112 ; SKX-LABEL: test_8xi16_perm_low_mem_mask3:
   6113 ; SKX:       # %bb.0:
   6114 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
   6115 ; SKX-NEXT:    retq # sched: [7:1.00]
   6116   %vec = load <8 x i16>, <8 x i16>* %vp
   6117   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   6118   ret <8 x i16> %res
   6119 }
   6120 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked low-word permute of a vector loaded from %vp: words 0-3 are
        ; reordered as [3,1,2,0] while words 4-7 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1}-masked
        ; vpshuflw that reads `mem` and writes xmm0 directly (no vmovdqa copy).
   6121 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask3:
   6122 ; GENERIC:       # %bb.0:
   6123 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   6124 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:0.50]
   6125 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6126 ;
   6127 ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask3:
   6128 ; SKX:       # %bb.0:
   6129 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   6130 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
   6131 ; SKX-NEXT:    retq # sched: [7:1.00]
   6132   %vec = load <8 x i16>, <8 x i16>* %vp
   6133   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   6134   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6135   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   6136   ret <8 x i16> %res
   6137 }
   6138 
   6139 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masked low-word permute of a vector loaded from %vp: words 0-3 are
        ; reordered as [3,1,2,0] while words 4-7 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane is set to zero.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1} {z} vpshuflw
        ; with a `mem` source.
   6140 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
   6141 ; GENERIC:       # %bb.0:
   6142 ; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
   6143 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:0.50]
   6144 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6145 ;
   6146 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
   6147 ; SKX:       # %bb.0:
   6148 ; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
   6149 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
   6150 ; SKX-NEXT:    retq # sched: [7:1.00]
   6151   %vec = load <8 x i16>, <8 x i16>* %vp
   6152   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   6153   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6154   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   6155   ret <8 x i16> %res
   6156 }
   6157 
   6158 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked high-word permute of a vector loaded from %vp: words 4-7 are
        ; reordered as [7,6,7,5] while words 0-3 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1}-masked
        ; vpshufhw that reads `mem` and writes xmm0 directly (no vmovdqa copy).
   6159 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask4:
   6160 ; GENERIC:       # %bb.0:
   6161 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   6162 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:0.50]
   6163 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6164 ;
   6165 ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask4:
   6166 ; SKX:       # %bb.0:
   6167 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   6168 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
   6169 ; SKX-NEXT:    retq # sched: [7:1.00]
   6170   %vec = load <8 x i16>, <8 x i16>* %vp
   6171   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
   6172   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6173   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   6174   ret <8 x i16> %res
   6175 }
   6176 
   6177 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masked high-word permute of a vector loaded from %vp: words 4-7 are
        ; reordered as [7,6,7,5] while words 0-3 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane is set to zero.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1} {z} vpshufhw
        ; with a `mem` source.
   6178 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
   6179 ; GENERIC:       # %bb.0:
   6180 ; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
   6181 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:0.50]
   6182 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6183 ;
   6184 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
   6185 ; SKX:       # %bb.0:
   6186 ; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
   6187 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
   6188 ; SKX-NEXT:    retq # sched: [7:1.00]
   6189   %vec = load <8 x i16>, <8 x i16>* %vp
   6190   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
   6191   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6192   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   6193   ret <8 x i16> %res
   6194 }
   6195 
   6196 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked low-word permute of a vector loaded from %vp: words 0-3 are
        ; reordered as [2,1,3,2] while words 4-7 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1}-masked
        ; vpshuflw that reads `mem` and writes xmm0 directly (no vmovdqa copy).
   6197 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask5:
   6198 ; GENERIC:       # %bb.0:
   6199 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   6200 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:0.50]
   6201 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6202 ;
   6203 ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask5:
   6204 ; SKX:       # %bb.0:
   6205 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   6206 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
   6207 ; SKX-NEXT:    retq # sched: [7:1.00]
   6208   %vec = load <8 x i16>, <8 x i16>* %vp
   6209   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   6210   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6211   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   6212   ret <8 x i16> %res
   6213 }
   6214 
   6215 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masked low-word permute of a vector loaded from %vp: words 0-3 are
        ; reordered as [2,1,3,2] while words 4-7 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane is set to zero.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1} {z} vpshuflw
        ; with a `mem` source.
   6216 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
   6217 ; GENERIC:       # %bb.0:
   6218 ; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
   6219 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:0.50]
   6220 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6221 ;
   6222 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
   6223 ; SKX:       # %bb.0:
   6224 ; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
   6225 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
   6226 ; SKX-NEXT:    retq # sched: [7:1.00]
   6227   %vec = load <8 x i16>, <8 x i16>* %vp
   6228   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   6229   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6230   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   6231   ret <8 x i16> %res
   6232 }
   6233 
   6234 define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) {
        ; Unmasked high-word permute of a vector loaded from %vp: words 4-7 are
        ; reordered as [7,4,4,4] while words 0-3 stay in place.
        ; Both check prefixes expect a single vpshufhw with a `mem` source, i.e. no
        ; separate load instruction before the shuffle.
   6235 ; GENERIC-LABEL: test_8xi16_perm_high_mem_mask6:
   6236 ; GENERIC:       # %bb.0:
   6237 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:0.50]
   6238 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6239 ;
   6240 ; SKX-LABEL: test_8xi16_perm_high_mem_mask6:
   6241 ; SKX:       # %bb.0:
   6242 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
   6243 ; SKX-NEXT:    retq # sched: [7:1.00]
   6244   %vec = load <8 x i16>, <8 x i16>* %vp
   6245   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
   6246   ret <8 x i16> %res
   6247 }
   6248 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked high-word permute of a vector loaded from %vp: words 4-7 are
        ; reordered as [7,4,4,4] while words 0-3 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1}-masked
        ; vpshufhw that reads `mem` and writes xmm0 directly (no vmovdqa copy).
   6249 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask6:
   6250 ; GENERIC:       # %bb.0:
   6251 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   6252 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:0.50]
   6253 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6254 ;
   6255 ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask6:
   6256 ; SKX:       # %bb.0:
   6257 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   6258 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
   6259 ; SKX-NEXT:    retq # sched: [7:1.00]
   6260   %vec = load <8 x i16>, <8 x i16>* %vp
   6261   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
   6262   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6263   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   6264   ret <8 x i16> %res
   6265 }
   6266 
   6267 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masked high-word permute of a vector loaded from %vp: words 4-7 are
        ; reordered as [7,4,4,4] while words 0-3 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane is set to zero.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1} {z} vpshufhw
        ; with a `mem` source.
   6268 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
   6269 ; GENERIC:       # %bb.0:
   6270 ; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
   6271 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:0.50]
   6272 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6273 ;
   6274 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
   6275 ; SKX:       # %bb.0:
   6276 ; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
   6277 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
   6278 ; SKX-NEXT:    retq # sched: [7:1.00]
   6279   %vec = load <8 x i16>, <8 x i16>* %vp
   6280   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
   6281   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6282   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   6283   ret <8 x i16> %res
   6284 }
   6285 
   6286 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
        ; Merge-masked low-word permute of a vector loaded from %vp: words 0-3 are
        ; reordered as [0,3,3,1] while words 4-7 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1}-masked
        ; vpshuflw that reads `mem` and writes xmm0 directly (no vmovdqa copy).
   6287 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask7:
   6288 ; GENERIC:       # %bb.0:
   6289 ; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
   6290 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:0.50]
   6291 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6292 ;
   6293 ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask7:
   6294 ; SKX:       # %bb.0:
   6295 ; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
   6296 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
   6297 ; SKX-NEXT:    retq # sched: [7:1.00]
   6298   %vec = load <8 x i16>, <8 x i16>* %vp
   6299   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
   6300   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6301   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   6302   ret <8 x i16> %res
   6303 }
   6304 
   6305 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) {
        ; Zero-masked low-word permute of a vector loaded from %vp: words 0-3 are
        ; reordered as [0,3,3,1] while words 4-7 stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane is set to zero.
        ; Both check prefixes expect vptestnmw to build %k1 and a {%k1} {z} vpshuflw
        ; with a `mem` source.
   6306 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
   6307 ; GENERIC:       # %bb.0:
   6308 ; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
   6309 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:0.50]
   6310 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6311 ;
   6312 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
   6313 ; SKX:       # %bb.0:
   6314 ; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
   6315 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
   6316 ; SKX-NEXT:    retq # sched: [7:1.00]
   6317   %vec = load <8 x i16>, <8 x i16>* %vp
   6318   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
   6319   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   6320   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   6321   ret <8 x i16> %res
   6322 }
   6323 
   6324 define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) {
        ; Unmasked high-word permute of a 16 x i16 register operand: in each group of
        ; eight words the upper four are reordered (source indices 4,4,6,4 and
        ; 12,12,14,12) while the lower four stay in place.
        ; Both check prefixes expect a single ymm vpshufhw.
   6325 ; GENERIC-LABEL: test_16xi16_perm_high_mask0:
   6326 ; GENERIC:       # %bb.0:
   6327 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
   6328 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6329 ;
   6330 ; SKX-LABEL: test_16xi16_perm_high_mask0:
   6331 ; SKX:       # %bb.0:
   6332 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
   6333 ; SKX-NEXT:    retq # sched: [7:1.00]
   6334   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
   6335   ret <16 x i16> %res
   6336 }
   6337 define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked high-word permute of a 16 x i16 register operand: in each
        ; group of eight words the upper four are reordered (source indices 4,4,6,4
        ; and 12,12,14,12) while the lower four stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmw to build %k1, a {%k1}-masked vpshufhw
        ; writing ymm1, and a vmovdqa copying ymm1 into ymm0.
   6338 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask0:
   6339 ; GENERIC:       # %bb.0:
   6340 ; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
   6341 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
   6342 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   6343 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6344 ;
   6345 ; SKX-LABEL: test_masked_16xi16_perm_high_mask0:
   6346 ; SKX:       # %bb.0:
   6347 ; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
   6348 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
   6349 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   6350 ; SKX-NEXT:    retq # sched: [7:1.00]
   6351   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
   6352   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6353   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6354   ret <16 x i16> %res
   6355 }
   6356 
   6357 define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masked high-word permute of a 16 x i16 register operand: in each group
        ; of eight words the upper four are reordered (source indices 4,4,6,4 and
        ; 12,12,14,12) while the lower four stay in place. Lanes where %mask is zero
        ; take the shuffled value; every other lane is set to zero.
        ; Both check prefixes expect vptestnmw to build %k1 followed by a
        ; {%k1} {z} vpshufhw on ymm0.
   6358 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask0:
   6359 ; GENERIC:       # %bb.0:
   6360 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6361 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
   6362 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6363 ;
   6364 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask0:
   6365 ; SKX:       # %bb.0:
   6366 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6367 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
   6368 ; SKX-NEXT:    retq # sched: [7:1.00]
   6369   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
   6370   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6371   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6372   ret <16 x i16> %res
   6373 }
   6374 define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked low-word permute of a 16 x i16 register operand: in each group
        ; of eight words the lower four are reordered (source indices 0,2,3,2 and
        ; 8,10,11,10) while the upper four stay in place. Lanes where %mask is zero
        ; take the shuffled value; every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmw to build %k1, a {%k1}-masked vpshuflw
        ; writing ymm1, and a vmovdqa copying ymm1 into ymm0.
   6375 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask1:
   6376 ; GENERIC:       # %bb.0:
   6377 ; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
   6378 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
   6379 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   6380 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6381 ;
   6382 ; SKX-LABEL: test_masked_16xi16_perm_low_mask1:
   6383 ; SKX:       # %bb.0:
   6384 ; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
   6385 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
   6386 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   6387 ; SKX-NEXT:    retq # sched: [7:1.00]
   6388   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   6389   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6390   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6391   ret <16 x i16> %res
   6392 }
   6393 
   6394 define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masked low-word permute of a 16 x i16 register operand: in each group
        ; of eight words the lower four are reordered (source indices 0,2,3,2 and
        ; 8,10,11,10) while the upper four stay in place. Lanes where %mask is zero
        ; take the shuffled value; every other lane is set to zero.
        ; Both check prefixes expect vptestnmw to build %k1 followed by a
        ; {%k1} {z} vpshuflw on ymm0.
   6395 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask1:
   6396 ; GENERIC:       # %bb.0:
   6397 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6398 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
   6399 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6400 ;
   6401 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask1:
   6402 ; SKX:       # %bb.0:
   6403 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6404 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
   6405 ; SKX-NEXT:    retq # sched: [7:1.00]
   6406   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   6407   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6408   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6409   ret <16 x i16> %res
   6410 }
   6411 define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked high-word permute of a 16 x i16 register operand: in each
        ; group of eight words the upper four are reordered (source indices 7,5,5,5
        ; and 15,13,13,13) while the lower four stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmw to build %k1, a {%k1}-masked vpshufhw
        ; writing ymm1, and a vmovdqa copying ymm1 into ymm0.
   6412 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask2:
   6413 ; GENERIC:       # %bb.0:
   6414 ; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
   6415 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
   6416 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   6417 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6418 ;
   6419 ; SKX-LABEL: test_masked_16xi16_perm_high_mask2:
   6420 ; SKX:       # %bb.0:
   6421 ; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
   6422 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
   6423 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   6424 ; SKX-NEXT:    retq # sched: [7:1.00]
   6425   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
   6426   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6427   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6428   ret <16 x i16> %res
   6429 }
   6430 
   6431 define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masked high-word permute of a 16 x i16 register operand: in each group
        ; of eight words the upper four are reordered (source indices 7,5,5,5 and
        ; 15,13,13,13) while the lower four stay in place. Lanes where %mask is zero
        ; take the shuffled value; every other lane is set to zero.
        ; Both check prefixes expect vptestnmw to build %k1 followed by a
        ; {%k1} {z} vpshufhw on ymm0.
   6432 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask2:
   6433 ; GENERIC:       # %bb.0:
   6434 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6435 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
   6436 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6437 ;
   6438 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask2:
   6439 ; SKX:       # %bb.0:
   6440 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6441 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
   6442 ; SKX-NEXT:    retq # sched: [7:1.00]
   6443   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
   6444   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6445   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6446   ret <16 x i16> %res
   6447 }
   6448 define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) {
        ; Unmasked low-word permute of a 16 x i16 register operand: in each group of
        ; eight words the lower four are reordered (source indices 3,2,3,2 and
        ; 11,10,11,10) while the upper four stay in place.
        ; Both check prefixes expect a single ymm vpshuflw.
   6449 ; GENERIC-LABEL: test_16xi16_perm_low_mask3:
   6450 ; GENERIC:       # %bb.0:
   6451 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
   6452 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6453 ;
   6454 ; SKX-LABEL: test_16xi16_perm_low_mask3:
   6455 ; SKX:       # %bb.0:
   6456 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
   6457 ; SKX-NEXT:    retq # sched: [7:1.00]
   6458   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   6459   ret <16 x i16> %res
   6460 }
   6461 define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked low-word permute of a 16 x i16 register operand: in each group
        ; of eight words the lower four are reordered (source indices 3,2,3,2 and
        ; 11,10,11,10) while the upper four stay in place. Lanes where %mask is zero
        ; take the shuffled value; every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmw to build %k1, a {%k1}-masked vpshuflw
        ; writing ymm1, and a vmovdqa copying ymm1 into ymm0.
   6462 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask3:
   6463 ; GENERIC:       # %bb.0:
   6464 ; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
   6465 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
   6466 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   6467 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6468 ;
   6469 ; SKX-LABEL: test_masked_16xi16_perm_low_mask3:
   6470 ; SKX:       # %bb.0:
   6471 ; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
   6472 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
   6473 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   6474 ; SKX-NEXT:    retq # sched: [7:1.00]
   6475   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   6476   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6477   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6478   ret <16 x i16> %res
   6479 }
   6480 
   6481 define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masked low-word permute of a 16 x i16 register operand: in each group
        ; of eight words the lower four are reordered (source indices 3,2,3,2 and
        ; 11,10,11,10) while the upper four stay in place. Lanes where %mask is zero
        ; take the shuffled value; every other lane is set to zero.
        ; Both check prefixes expect vptestnmw to build %k1 followed by a
        ; {%k1} {z} vpshuflw on ymm0.
   6482 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask3:
   6483 ; GENERIC:       # %bb.0:
   6484 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6485 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
   6486 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6487 ;
   6488 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask3:
   6489 ; SKX:       # %bb.0:
   6490 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6491 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
   6492 ; SKX-NEXT:    retq # sched: [7:1.00]
   6493   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   6494   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6495   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6496   ret <16 x i16> %res
   6497 }
   6498 define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked high-word permute of a 16 x i16 register operand: in each
        ; group of eight words the upper four are reordered (source indices 6,7,4,7
        ; and 14,15,12,15) while the lower four stay in place. Lanes where %mask is
        ; zero take the shuffled value; every other lane keeps %vec2.
        ; Both check prefixes expect vptestnmw to build %k1, a {%k1}-masked vpshufhw
        ; writing ymm1, and a vmovdqa copying ymm1 into ymm0.
   6499 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask4:
   6500 ; GENERIC:       # %bb.0:
   6501 ; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
   6502 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
   6503 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   6504 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6505 ;
   6506 ; SKX-LABEL: test_masked_16xi16_perm_high_mask4:
   6507 ; SKX:       # %bb.0:
   6508 ; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
   6509 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
   6510 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   6511 ; SKX-NEXT:    retq # sched: [7:1.00]
   6512   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
   6513   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6514   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6515   ret <16 x i16> %res
   6516 }
   6517 
   6518 define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masked high-word shuffle: the high four i16 of each 8-element half
        ; are reordered to [6,7,4,7] (half-relative); the low four stay in place.
        ; The select keeps the shuffle where %mask == 0 and yields zero elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshufhw with {%k1} {z}.
   6519 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask4:
   6520 ; GENERIC:       # %bb.0:
   6521 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6522 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
   6523 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6524 ;
   6525 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask4:
   6526 ; SKX:       # %bb.0:
   6527 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6528 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
   6529 ; SKX-NEXT:    retq # sched: [7:1.00]
   6530   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
   6531   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6532   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6533   ret <16 x i16> %res
   6534 }
   6535 define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked low-word shuffle: the low four i16 of each 8-element half are
        ; reordered to [3,3,3,0] (half-relative); the high four stay in place.
        ; The select keeps the shuffle where %mask == 0 and %vec2 elsewhere.
        ; Expected codegen: vptestnmw builds %k1, vpshuflw {%k1} writes ymm1, and a
        ; vmovdqa copies ymm1 into ymm0.
   6536 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask5:
   6537 ; GENERIC:       # %bb.0:
   6538 ; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
   6539 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
   6540 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   6541 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6542 ;
   6543 ; SKX-LABEL: test_masked_16xi16_perm_low_mask5:
   6544 ; SKX:       # %bb.0:
   6545 ; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
   6546 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
   6547 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   6548 ; SKX-NEXT:    retq # sched: [7:1.00]
   6549   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   6550   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6551   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6552   ret <16 x i16> %res
   6553 }
   6554 
   6555 define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masked low-word shuffle: the low four i16 of each 8-element half are
        ; reordered to [3,3,3,0] (half-relative); the high four stay in place.
        ; The select keeps the shuffle where %mask == 0 and yields zero elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshuflw with {%k1} {z}.
   6556 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask5:
   6557 ; GENERIC:       # %bb.0:
   6558 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6559 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
   6560 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6561 ;
   6562 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask5:
   6563 ; SKX:       # %bb.0:
   6564 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6565 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
   6566 ; SKX-NEXT:    retq # sched: [7:1.00]
   6567   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   6568   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6569   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6570   ret <16 x i16> %res
   6571 }
   6572 define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) {
        ; Unmasked high-word shuffle: the high four i16 of each 8-element half are
        ; reordered to [6,7,6,5] (half-relative); the low four stay in place.
        ; Expected codegen is a single vpshufhw on ymm0 followed by retq.
   6573 ; GENERIC-LABEL: test_16xi16_perm_high_mask6:
   6574 ; GENERIC:       # %bb.0:
   6575 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
   6576 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6577 ;
   6578 ; SKX-LABEL: test_16xi16_perm_high_mask6:
   6579 ; SKX:       # %bb.0:
   6580 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
   6581 ; SKX-NEXT:    retq # sched: [7:1.00]
   6582   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
   6583   ret <16 x i16> %res
   6584 }
   6585 define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked high-word shuffle: the high four i16 of each 8-element half
        ; are reordered to [6,7,6,5] (half-relative); the low four stay in place.
        ; The select keeps the shuffle where %mask == 0 and %vec2 elsewhere.
        ; Expected codegen: vptestnmw builds %k1, vpshufhw {%k1} writes ymm1, and a
        ; vmovdqa copies ymm1 into ymm0.
   6586 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask6:
   6587 ; GENERIC:       # %bb.0:
   6588 ; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
   6589 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
   6590 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   6591 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6592 ;
   6593 ; SKX-LABEL: test_masked_16xi16_perm_high_mask6:
   6594 ; SKX:       # %bb.0:
   6595 ; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
   6596 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
   6597 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   6598 ; SKX-NEXT:    retq # sched: [7:1.00]
   6599   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
   6600   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6601   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6602   ret <16 x i16> %res
   6603 }
   6604 
   6605 define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masked high-word shuffle: the high four i16 of each 8-element half
        ; are reordered to [6,7,6,5] (half-relative); the low four stay in place.
        ; The select keeps the shuffle where %mask == 0 and yields zero elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshufhw with {%k1} {z}.
   6606 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask6:
   6607 ; GENERIC:       # %bb.0:
   6608 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6609 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
   6610 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6611 ;
   6612 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask6:
   6613 ; SKX:       # %bb.0:
   6614 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6615 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
   6616 ; SKX-NEXT:    retq # sched: [7:1.00]
   6617   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
   6618   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6619   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6620   ret <16 x i16> %res
   6621 }
   6622 define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked low-word shuffle: the low four i16 of each 8-element half are
        ; reordered to [3,2,1,2] (half-relative); the high four stay in place.
        ; The select keeps the shuffle where %mask == 0 and %vec2 elsewhere.
        ; Expected codegen: vptestnmw builds %k1, vpshuflw {%k1} writes ymm1, and a
        ; vmovdqa copies ymm1 into ymm0.
   6623 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask7:
   6624 ; GENERIC:       # %bb.0:
   6625 ; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
   6626 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
   6627 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   6628 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6629 ;
   6630 ; SKX-LABEL: test_masked_16xi16_perm_low_mask7:
   6631 ; SKX:       # %bb.0:
   6632 ; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
   6633 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
   6634 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   6635 ; SKX-NEXT:    retq # sched: [7:1.00]
   6636   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
   6637   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6638   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6639   ret <16 x i16> %res
   6640 }
   6641 
   6642 define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) {
        ; Zero-masked low-word shuffle: the low four i16 of each 8-element half are
        ; reordered to [3,2,1,2] (half-relative); the high four stay in place.
        ; The select keeps the shuffle where %mask == 0 and yields zero elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshuflw with {%k1} {z}.
   6643 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask7:
   6644 ; GENERIC:       # %bb.0:
   6645 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6646 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
   6647 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6648 ;
   6649 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask7:
   6650 ; SKX:       # %bb.0:
   6651 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6652 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
   6653 ; SKX-NEXT:    retq # sched: [7:1.00]
   6654   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
   6655   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6656   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6657   ret <16 x i16> %res
   6658 }
   6659 define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
        ; Unmasked high-word shuffle of a vector loaded from %vp: the high four i16
        ; of each 8-element half are reordered to [5,6,4,7] (half-relative).
        ; Expected codegen is a single vpshufhw with a mem source operand.
   6660 ; GENERIC-LABEL: test_16xi16_perm_high_mem_mask0:
   6661 ; GENERIC:       # %bb.0:
   6662 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
   6663 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6664 ;
   6665 ; SKX-LABEL: test_16xi16_perm_high_mem_mask0:
   6666 ; SKX:       # %bb.0:
   6667 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
   6668 ; SKX-NEXT:    retq # sched: [7:1.00]
   6669   %vec = load <16 x i16>, <16 x i16>* %vp
   6670   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
   6671   ret <16 x i16> %res
   6672 }
   6673 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked high-word shuffle of a vector loaded from %vp: the high four
        ; i16 of each 8-element half are reordered to [5,6,4,7] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and %vec2 elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshufhw {%k1} with a mem
        ; source writes directly into ymm0 (no extra register copy is expected).
   6674 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask0:
   6675 ; GENERIC:       # %bb.0:
   6676 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6677 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
   6678 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6679 ;
   6680 ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask0:
   6681 ; SKX:       # %bb.0:
   6682 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6683 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
   6684 ; SKX-NEXT:    retq # sched: [7:1.00]
   6685   %vec = load <16 x i16>, <16 x i16>* %vp
   6686   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
   6687   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6688   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6689   ret <16 x i16> %res
   6690 }
   6691 
   6692 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
        ; Zero-masked high-word shuffle of a vector loaded from %vp: the high four
        ; i16 of each 8-element half are reordered to [5,6,4,7] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and yields zero elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshufhw {%k1} {z} with a
        ; mem source operand.
   6693 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
   6694 ; GENERIC:       # %bb.0:
   6695 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
   6696 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
   6697 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6698 ;
   6699 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
   6700 ; SKX:       # %bb.0:
   6701 ; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
   6702 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
   6703 ; SKX-NEXT:    retq # sched: [7:1.00]
   6704   %vec = load <16 x i16>, <16 x i16>* %vp
   6705   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
   6706   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6707   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6708   ret <16 x i16> %res
   6709 }
   6710 
   6711 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked low-word shuffle of a vector loaded from %vp: the low four
        ; i16 of each 8-element half are reordered to [1,3,3,0] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and %vec2 elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshuflw {%k1} with a mem
        ; source writes directly into ymm0.
   6712 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask1:
   6713 ; GENERIC:       # %bb.0:
   6714 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6715 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
   6716 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6717 ;
   6718 ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask1:
   6719 ; SKX:       # %bb.0:
   6720 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6721 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
   6722 ; SKX-NEXT:    retq # sched: [7:1.00]
   6723   %vec = load <16 x i16>, <16 x i16>* %vp
   6724   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   6725   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6726   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6727   ret <16 x i16> %res
   6728 }
   6729 
   6730 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
        ; Zero-masked low-word shuffle of a vector loaded from %vp: the low four
        ; i16 of each 8-element half are reordered to [1,3,3,0] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and yields zero elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshuflw {%k1} {z} with a
        ; mem source operand.
   6731 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
   6732 ; GENERIC:       # %bb.0:
   6733 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
   6734 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
   6735 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6736 ;
   6737 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
   6738 ; SKX:       # %bb.0:
   6739 ; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
   6740 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
   6741 ; SKX-NEXT:    retq # sched: [7:1.00]
   6742   %vec = load <16 x i16>, <16 x i16>* %vp
   6743   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   6744   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6745   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6746   ret <16 x i16> %res
   6747 }
   6748 
   6749 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked high-word shuffle of a vector loaded from %vp: the high four
        ; i16 of each 8-element half are reordered to [5,6,5,6] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and %vec2 elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshufhw {%k1} with a mem
        ; source writes directly into ymm0.
   6750 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask2:
   6751 ; GENERIC:       # %bb.0:
   6752 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6753 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
   6754 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6755 ;
   6756 ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask2:
   6757 ; SKX:       # %bb.0:
   6758 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6759 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
   6760 ; SKX-NEXT:    retq # sched: [7:1.00]
   6761   %vec = load <16 x i16>, <16 x i16>* %vp
   6762   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
   6763   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6764   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6765   ret <16 x i16> %res
   6766 }
   6767 
   6768 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
        ; Zero-masked high-word shuffle of a vector loaded from %vp: the high four
        ; i16 of each 8-element half are reordered to [5,6,5,6] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and yields zero elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshufhw {%k1} {z} with a
        ; mem source operand.
   6769 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
   6770 ; GENERIC:       # %bb.0:
   6771 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
   6772 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
   6773 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6774 ;
   6775 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
   6776 ; SKX:       # %bb.0:
   6777 ; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
   6778 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
   6779 ; SKX-NEXT:    retq # sched: [7:1.00]
   6780   %vec = load <16 x i16>, <16 x i16>* %vp
   6781   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
   6782   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6783   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6784   ret <16 x i16> %res
   6785 }
   6786 
   6787 define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
        ; Unmasked low-word shuffle of a vector loaded from %vp: the low four i16
        ; of each 8-element half are reordered to [3,2,3,0] (half-relative).
        ; Expected codegen is a single vpshuflw with a mem source operand.
   6788 ; GENERIC-LABEL: test_16xi16_perm_low_mem_mask3:
   6789 ; GENERIC:       # %bb.0:
   6790 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
   6791 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6792 ;
   6793 ; SKX-LABEL: test_16xi16_perm_low_mem_mask3:
   6794 ; SKX:       # %bb.0:
   6795 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
   6796 ; SKX-NEXT:    retq # sched: [7:1.00]
   6797   %vec = load <16 x i16>, <16 x i16>* %vp
   6798   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   6799   ret <16 x i16> %res
   6800 }
   6801 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked low-word shuffle of a vector loaded from %vp: the low four
        ; i16 of each 8-element half are reordered to [3,2,3,0] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and %vec2 elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshuflw {%k1} with a mem
        ; source writes directly into ymm0.
   6802 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask3:
   6803 ; GENERIC:       # %bb.0:
   6804 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6805 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
   6806 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6807 ;
   6808 ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask3:
   6809 ; SKX:       # %bb.0:
   6810 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6811 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
   6812 ; SKX-NEXT:    retq # sched: [7:1.00]
   6813   %vec = load <16 x i16>, <16 x i16>* %vp
   6814   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   6815   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6816   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6817   ret <16 x i16> %res
   6818 }
   6819 
   6820 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
        ; Zero-masked low-word shuffle of a vector loaded from %vp: the low four
        ; i16 of each 8-element half are reordered to [3,2,3,0] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and yields zero elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshuflw {%k1} {z} with a
        ; mem source operand.
   6821 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
   6822 ; GENERIC:       # %bb.0:
   6823 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
   6824 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
   6825 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6826 ;
   6827 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
   6828 ; SKX:       # %bb.0:
   6829 ; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
   6830 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
   6831 ; SKX-NEXT:    retq # sched: [7:1.00]
   6832   %vec = load <16 x i16>, <16 x i16>* %vp
   6833   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   6834   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6835   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6836   ret <16 x i16> %res
   6837 }
   6838 
   6839 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked high-word shuffle of a vector loaded from %vp: the high four
        ; i16 of each 8-element half are reordered to [7,7,6,7] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and %vec2 elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshufhw {%k1} with a mem
        ; source writes directly into ymm0.
   6840 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask4:
   6841 ; GENERIC:       # %bb.0:
   6842 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6843 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
   6844 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6845 ;
   6846 ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask4:
   6847 ; SKX:       # %bb.0:
   6848 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6849 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
   6850 ; SKX-NEXT:    retq # sched: [7:1.00]
   6851   %vec = load <16 x i16>, <16 x i16>* %vp
   6852   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
   6853   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6854   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6855   ret <16 x i16> %res
   6856 }
   6857 
   6858 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) {
        ; Zero-masked high-word shuffle of a vector loaded from %vp: the high four
        ; i16 of each 8-element half are reordered to [7,7,6,7] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and yields zero elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshufhw {%k1} {z} with a
        ; mem source operand.
   6859 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
   6860 ; GENERIC:       # %bb.0:
   6861 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
   6862 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
   6863 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6864 ;
   6865 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
   6866 ; SKX:       # %bb.0:
   6867 ; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
   6868 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
   6869 ; SKX-NEXT:    retq # sched: [7:1.00]
   6870   %vec = load <16 x i16>, <16 x i16>* %vp
   6871   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
   6872   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6873   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6874   ret <16 x i16> %res
   6875 }
   6876 
   6877 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked low-word shuffle of a vector loaded from %vp: the low four
        ; i16 of each 8-element half are reordered to [1,3,3,2] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and %vec2 elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshuflw {%k1} with a mem
        ; source writes directly into ymm0.
   6878 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask5:
   6879 ; GENERIC:       # %bb.0:
   6880 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6881 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
   6882 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6883 ;
   6884 ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask5:
   6885 ; SKX:       # %bb.0:
   6886 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6887 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
   6888 ; SKX-NEXT:    retq # sched: [7:1.00]
   6889   %vec = load <16 x i16>, <16 x i16>* %vp
   6890   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   6891   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6892   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6893   ret <16 x i16> %res
   6894 }
   6895 
   6896 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) {
        ; Zero-masked low-word shuffle of a vector loaded from %vp: the low four
        ; i16 of each 8-element half are reordered to [1,3,3,2] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and yields zero elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshuflw {%k1} {z} with a
        ; mem source operand.
   6897 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
   6898 ; GENERIC:       # %bb.0:
   6899 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
   6900 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
   6901 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6902 ;
   6903 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
   6904 ; SKX:       # %bb.0:
   6905 ; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
   6906 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
   6907 ; SKX-NEXT:    retq # sched: [7:1.00]
   6908   %vec = load <16 x i16>, <16 x i16>* %vp
   6909   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   6910   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6911   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6912   ret <16 x i16> %res
   6913 }
   6914 
   6915 define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
        ; Unmasked high-word shuffle of a vector loaded from %vp: the high four i16
        ; of each 8-element half are reordered to [4,4,4,5] (half-relative).
        ; Expected codegen is a single vpshufhw with a mem source operand.
   6916 ; GENERIC-LABEL: test_16xi16_perm_high_mem_mask6:
   6917 ; GENERIC:       # %bb.0:
   6918 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
   6919 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6920 ;
   6921 ; SKX-LABEL: test_16xi16_perm_high_mem_mask6:
   6922 ; SKX:       # %bb.0:
   6923 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
   6924 ; SKX-NEXT:    retq # sched: [7:1.00]
   6925   %vec = load <16 x i16>, <16 x i16>* %vp
   6926   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
   6927   ret <16 x i16> %res
   6928 }
   6929 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked high-word shuffle of a vector loaded from %vp: the high four
        ; i16 of each 8-element half are reordered to [4,4,4,5] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and %vec2 elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshufhw {%k1} with a mem
        ; source writes directly into ymm0.
   6930 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask6:
   6931 ; GENERIC:       # %bb.0:
   6932 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6933 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
   6934 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6935 ;
   6936 ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask6:
   6937 ; SKX:       # %bb.0:
   6938 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6939 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
   6940 ; SKX-NEXT:    retq # sched: [7:1.00]
   6941   %vec = load <16 x i16>, <16 x i16>* %vp
   6942   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
   6943   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6944   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6945   ret <16 x i16> %res
   6946 }
   6947 
   6948 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) {
        ; Zero-masked high-word shuffle of a vector loaded from %vp: the high four
        ; i16 of each 8-element half are reordered to [4,4,4,5] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and yields zero elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshufhw {%k1} {z} with a
        ; mem source operand.
   6949 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
   6950 ; GENERIC:       # %bb.0:
   6951 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
   6952 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
   6953 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6954 ;
   6955 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
   6956 ; SKX:       # %bb.0:
   6957 ; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
   6958 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
   6959 ; SKX-NEXT:    retq # sched: [7:1.00]
   6960   %vec = load <16 x i16>, <16 x i16>* %vp
   6961   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
   6962   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6963   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   6964   ret <16 x i16> %res
   6965 }
   6966 
   6967 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
        ; Merge-masked low-word shuffle of a vector loaded from %vp: the low four
        ; i16 of each 8-element half are reordered to [3,1,3,2] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and %vec2 elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshuflw {%k1} with a mem
        ; source writes directly into ymm0.
   6968 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask7:
   6969 ; GENERIC:       # %bb.0:
   6970 ; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
   6971 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
   6972 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6973 ;
   6974 ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask7:
   6975 ; SKX:       # %bb.0:
   6976 ; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
   6977 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
   6978 ; SKX-NEXT:    retq # sched: [7:1.00]
   6979   %vec = load <16 x i16>, <16 x i16>* %vp
   6980   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   6981   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   6982   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   6983   ret <16 x i16> %res
   6984 }
   6985 
   6986 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) {
        ; Zero-masked low-word shuffle of a vector loaded from %vp: the low four
        ; i16 of each 8-element half are reordered to [3,1,3,2] (half-relative).
        ; The select keeps the shuffle where %mask == 0 and yields zero elsewhere.
        ; Expected codegen: vptestnmw builds %k1, then vpshuflw {%k1} {z} with a
        ; mem source operand.
   6987 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
   6988 ; GENERIC:       # %bb.0:
   6989 ; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
   6990 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
   6991 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6992 ;
   6993 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
   6994 ; SKX:       # %bb.0:
   6995 ; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
   6996 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
   6997 ; SKX-NEXT:    retq # sched: [7:1.00]
   6998   %vec = load <16 x i16>, <16 x i16>* %vp
   6999   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   7000   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   7001   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   7002   ret <16 x i16> %res
   7003 }
   7004 
   7005 define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) {
        ; Unmasked shuffle of %vec: indices 4-7 of every group of eight i16 elements are
        ; permuted within the group, indices 0-3 stay in place.
        ; Both check prefixes expect a single register-source vpshufhw followed by retq.
   7006 ; GENERIC-LABEL: test_32xi16_perm_high_mask0:
   7007 ; GENERIC:       # %bb.0:
   7008 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
   7009 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7010 ;
   7011 ; SKX-LABEL: test_32xi16_perm_high_mask0:
   7012 ; SKX:       # %bb.0:
   7013 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
   7014 ; SKX-NEXT:    retq # sched: [7:1.00]
   7015   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   7016   ret <32 x i16> %res
   7017 }
   7018 define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Shuffles %vec: indices 4-7 of every group of eight i16 elements are permuted within
        ; the group, indices 0-3 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
        ; Both check prefixes expect vptestnmw to produce %k1, a masked vpshufhw writing %zmm1,
        ; and a vmovdqa64 copy of %zmm1 into %zmm0.
   7019 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask0:
   7020 ; GENERIC:       # %bb.0:
   7021 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
   7022 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
   7023 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   7024 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7025 ;
   7026 ; SKX-LABEL: test_masked_32xi16_perm_high_mask0:
   7027 ; SKX:       # %bb.0:
   7028 ; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
   7029 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
   7030 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   7031 ; SKX-NEXT:    retq # sched: [7:1.00]
   7032   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   7033   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7034   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7035   ret <32 x i16> %res
   7036 }
   7037 
   7038 define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) {
        ; Shuffles %vec: indices 4-7 of every group of eight i16 elements are permuted within
        ; the group, indices 0-3 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
        ; Both check prefixes expect vptestnmw to produce %k1 and one zero-masked ({z}) vpshufhw.
   7039 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask0:
   7040 ; GENERIC:       # %bb.0:
   7041 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7042 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
   7043 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7044 ;
   7045 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask0:
   7046 ; SKX:       # %bb.0:
   7047 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7048 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
   7049 ; SKX-NEXT:    retq # sched: [7:1.00]
   7050   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   7051   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7052   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7053   ret <32 x i16> %res
   7054 }
   7055 define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Shuffles %vec: indices 0-3 of every group of eight i16 elements are permuted within
        ; the group, indices 4-7 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
        ; Both check prefixes expect vptestnmw to produce %k1, a masked vpshuflw writing %zmm1,
        ; and a vmovdqa64 copy of %zmm1 into %zmm0.
   7056 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask1:
   7057 ; GENERIC:       # %bb.0:
   7058 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
   7059 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
   7060 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   7061 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7062 ;
   7063 ; SKX-LABEL: test_masked_32xi16_perm_low_mask1:
   7064 ; SKX:       # %bb.0:
   7065 ; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
   7066 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
   7067 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   7068 ; SKX-NEXT:    retq # sched: [7:1.00]
   7069   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
   7070   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7071   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7072   ret <32 x i16> %res
   7073 }
   7074 
   7075 define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) {
        ; Shuffles %vec: indices 0-3 of every group of eight i16 elements are permuted within
        ; the group, indices 4-7 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
        ; Both check prefixes expect vptestnmw to produce %k1 and one zero-masked ({z}) vpshuflw.
   7076 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask1:
   7077 ; GENERIC:       # %bb.0:
   7078 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7079 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
   7080 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7081 ;
   7082 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask1:
   7083 ; SKX:       # %bb.0:
   7084 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7085 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
   7086 ; SKX-NEXT:    retq # sched: [7:1.00]
   7087   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
   7088   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7089   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7090   ret <32 x i16> %res
   7091 }
   7092 define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Shuffles %vec: indices 4-7 of every group of eight i16 elements are permuted within
        ; the group, indices 0-3 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
        ; Both check prefixes expect vptestnmw to produce %k1, a masked vpshufhw writing %zmm1,
        ; and a vmovdqa64 copy of %zmm1 into %zmm0.
   7093 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask2:
   7094 ; GENERIC:       # %bb.0:
   7095 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
   7096 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
   7097 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   7098 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7099 ;
   7100 ; SKX-LABEL: test_masked_32xi16_perm_high_mask2:
   7101 ; SKX:       # %bb.0:
   7102 ; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
   7103 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
   7104 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   7105 ; SKX-NEXT:    retq # sched: [7:1.00]
   7106   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
   7107   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7108   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7109   ret <32 x i16> %res
   7110 }
   7111 
   7112 define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) {
        ; Shuffles %vec: indices 4-7 of every group of eight i16 elements are permuted within
        ; the group, indices 0-3 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
        ; Both check prefixes expect vptestnmw to produce %k1 and one zero-masked ({z}) vpshufhw.
   7113 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask2:
   7114 ; GENERIC:       # %bb.0:
   7115 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7116 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
   7117 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7118 ;
   7119 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask2:
   7120 ; SKX:       # %bb.0:
   7121 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7122 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
   7123 ; SKX-NEXT:    retq # sched: [7:1.00]
   7124   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
   7125   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7126   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7127   ret <32 x i16> %res
   7128 }
   7129 define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) {
        ; Unmasked shuffle of %vec: indices 0-3 of every group of eight i16 elements are
        ; permuted within the group, indices 4-7 stay in place.
        ; Both check prefixes expect a single register-source vpshuflw followed by retq.
   7130 ; GENERIC-LABEL: test_32xi16_perm_low_mask3:
   7131 ; GENERIC:       # %bb.0:
   7132 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
   7133 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7134 ;
   7135 ; SKX-LABEL: test_32xi16_perm_low_mask3:
   7136 ; SKX:       # %bb.0:
   7137 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
   7138 ; SKX-NEXT:    retq # sched: [7:1.00]
   7139   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   7140   ret <32 x i16> %res
   7141 }
   7142 define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Shuffles %vec: indices 0-3 of every group of eight i16 elements are permuted within
        ; the group, indices 4-7 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
        ; Both check prefixes expect vptestnmw to produce %k1, a masked vpshuflw writing %zmm1,
        ; and a vmovdqa64 copy of %zmm1 into %zmm0.
   7143 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask3:
   7144 ; GENERIC:       # %bb.0:
   7145 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
   7146 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
   7147 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   7148 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7149 ;
   7150 ; SKX-LABEL: test_masked_32xi16_perm_low_mask3:
   7151 ; SKX:       # %bb.0:
   7152 ; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
   7153 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
   7154 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   7155 ; SKX-NEXT:    retq # sched: [7:1.00]
   7156   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   7157   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7158   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7159   ret <32 x i16> %res
   7160 }
   7161 
   7162 define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) {
        ; Shuffles %vec: indices 0-3 of every group of eight i16 elements are permuted within
        ; the group, indices 4-7 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
        ; Both check prefixes expect vptestnmw to produce %k1 and one zero-masked ({z}) vpshuflw.
   7163 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask3:
   7164 ; GENERIC:       # %bb.0:
   7165 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7166 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
   7167 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7168 ;
   7169 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask3:
   7170 ; SKX:       # %bb.0:
   7171 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7172 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
   7173 ; SKX-NEXT:    retq # sched: [7:1.00]
   7174   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   7175   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7176   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7177   ret <32 x i16> %res
   7178 }
   7179 define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Shuffles %vec: indices 4-7 of every group of eight i16 elements are permuted within
        ; the group, indices 0-3 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
        ; Both check prefixes expect vptestnmw to produce %k1, a masked vpshufhw writing %zmm1,
        ; and a vmovdqa64 copy of %zmm1 into %zmm0.
   7180 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask4:
   7181 ; GENERIC:       # %bb.0:
   7182 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
   7183 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
   7184 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   7185 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7186 ;
   7187 ; SKX-LABEL: test_masked_32xi16_perm_high_mask4:
   7188 ; SKX:       # %bb.0:
   7189 ; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
   7190 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
   7191 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   7192 ; SKX-NEXT:    retq # sched: [7:1.00]
   7193   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
   7194   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7195   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7196   ret <32 x i16> %res
   7197 }
   7198 
   7199 define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) {
        ; Shuffles %vec: indices 4-7 of every group of eight i16 elements are permuted within
        ; the group, indices 0-3 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
        ; Both check prefixes expect vptestnmw to produce %k1 and one zero-masked ({z}) vpshufhw.
   7200 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask4:
   7201 ; GENERIC:       # %bb.0:
   7202 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7203 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
   7204 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7205 ;
   7206 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask4:
   7207 ; SKX:       # %bb.0:
   7208 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7209 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
   7210 ; SKX-NEXT:    retq # sched: [7:1.00]
   7211   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
   7212   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7213   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7214   ret <32 x i16> %res
   7215 }
   7216 define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Shuffles %vec: indices 0-3 of every group of eight i16 elements are permuted within
        ; the group, indices 4-7 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
        ; Both check prefixes expect vptestnmw to produce %k1, a masked vpshuflw writing %zmm1,
        ; and a vmovdqa64 copy of %zmm1 into %zmm0.
   7217 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask5:
   7218 ; GENERIC:       # %bb.0:
   7219 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
   7220 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
   7221 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   7222 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7223 ;
   7224 ; SKX-LABEL: test_masked_32xi16_perm_low_mask5:
   7225 ; SKX:       # %bb.0:
   7226 ; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
   7227 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
   7228 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   7229 ; SKX-NEXT:    retq # sched: [7:1.00]
   7230   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
   7231   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7232   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7233   ret <32 x i16> %res
   7234 }
   7235 
   7236 define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) {
        ; Shuffles %vec: indices 0-3 of every group of eight i16 elements are permuted within
        ; the group, indices 4-7 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
        ; Both check prefixes expect vptestnmw to produce %k1 and one zero-masked ({z}) vpshuflw.
   7237 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask5:
   7238 ; GENERIC:       # %bb.0:
   7239 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7240 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
   7241 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7242 ;
   7243 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask5:
   7244 ; SKX:       # %bb.0:
   7245 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7246 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
   7247 ; SKX-NEXT:    retq # sched: [7:1.00]
   7248   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
   7249   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7250   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7251   ret <32 x i16> %res
   7252 }
   7253 define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) {
        ; Unmasked shuffle of %vec: indices 4-7 of every group of eight i16 elements are
        ; permuted within the group, indices 0-3 stay in place.
        ; Both check prefixes expect a single register-source vpshufhw followed by retq.
   7254 ; GENERIC-LABEL: test_32xi16_perm_high_mask6:
   7255 ; GENERIC:       # %bb.0:
   7256 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
   7257 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7258 ;
   7259 ; SKX-LABEL: test_32xi16_perm_high_mask6:
   7260 ; SKX:       # %bb.0:
   7261 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
   7262 ; SKX-NEXT:    retq # sched: [7:1.00]
   7263   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   7264   ret <32 x i16> %res
   7265 }
   7266 define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Shuffles %vec: indices 4-7 of every group of eight i16 elements are permuted within
        ; the group, indices 0-3 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
        ; Both check prefixes expect vptestnmw to produce %k1, a masked vpshufhw writing %zmm1,
        ; and a vmovdqa64 copy of %zmm1 into %zmm0.
   7267 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask6:
   7268 ; GENERIC:       # %bb.0:
   7269 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
   7270 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
   7271 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   7272 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7273 ;
   7274 ; SKX-LABEL: test_masked_32xi16_perm_high_mask6:
   7275 ; SKX:       # %bb.0:
   7276 ; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
   7277 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
   7278 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   7279 ; SKX-NEXT:    retq # sched: [7:1.00]
   7280   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   7281   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7282   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7283   ret <32 x i16> %res
   7284 }
   7285 
   7286 define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) {
        ; Shuffles %vec: indices 4-7 of every group of eight i16 elements are permuted within
        ; the group, indices 0-3 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
        ; Both check prefixes expect vptestnmw to produce %k1 and one zero-masked ({z}) vpshufhw.
   7287 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask6:
   7288 ; GENERIC:       # %bb.0:
   7289 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7290 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
   7291 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7292 ;
   7293 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask6:
   7294 ; SKX:       # %bb.0:
   7295 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7296 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
   7297 ; SKX-NEXT:    retq # sched: [7:1.00]
   7298   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   7299   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7300   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7301   ret <32 x i16> %res
   7302 }
   7303 define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Shuffles %vec: indices 0-3 of every group of eight i16 elements are permuted within
        ; the group, indices 4-7 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
        ; Both check prefixes expect vptestnmw to produce %k1, a masked vpshuflw writing %zmm1,
        ; and a vmovdqa64 copy of %zmm1 into %zmm0.
   7304 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask7:
   7305 ; GENERIC:       # %bb.0:
   7306 ; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
   7307 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
   7308 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   7309 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7310 ;
   7311 ; SKX-LABEL: test_masked_32xi16_perm_low_mask7:
   7312 ; SKX:       # %bb.0:
   7313 ; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
   7314 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
   7315 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   7316 ; SKX-NEXT:    retq # sched: [7:1.00]
   7317   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   7318   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7319   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7320   ret <32 x i16> %res
   7321 }
   7322 
   7323 define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) {
        ; Shuffles %vec: indices 0-3 of every group of eight i16 elements are permuted within
        ; the group, indices 4-7 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
        ; Both check prefixes expect vptestnmw to produce %k1 and one zero-masked ({z}) vpshuflw.
   7324 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask7:
   7325 ; GENERIC:       # %bb.0:
   7326 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7327 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
   7328 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7329 ;
   7330 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask7:
   7331 ; SKX:       # %bb.0:
   7332 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7333 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
   7334 ; SKX-NEXT:    retq # sched: [7:1.00]
   7335   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   7336   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7337   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7338   ret <32 x i16> %res
   7339 }
   7340 define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
        ; Loads a <32 x i16> vector from %vp and applies an unmasked shuffle: indices 4-7 of
        ; every group of eight elements are permuted within the group, indices 0-3 stay in place.
        ; Both check prefixes expect a single vpshufhw with a memory source followed by retq.
   7341 ; GENERIC-LABEL: test_32xi16_perm_high_mem_mask0:
   7342 ; GENERIC:       # %bb.0:
   7343 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
   7344 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7345 ;
   7346 ; SKX-LABEL: test_32xi16_perm_high_mem_mask0:
   7347 ; SKX:       # %bb.0:
   7348 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
   7349 ; SKX-NEXT:    retq # sched: [7:1.00]
   7350   %vec = load <32 x i16>, <32 x i16>* %vp
   7351   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
   7352   ret <32 x i16> %res
   7353 }
   7354 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Loads a <32 x i16> vector from %vp and shuffles it: indices 4-7 of every group of
        ; eight elements are permuted within the group, indices 0-3 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
        ; Both check prefixes expect vptestnmw to produce %k1 and one masked vpshufhw reading memory.
   7355 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask0:
   7356 ; GENERIC:       # %bb.0:
   7357 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7358 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
   7359 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7360 ;
   7361 ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask0:
   7362 ; SKX:       # %bb.0:
   7363 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7364 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
   7365 ; SKX-NEXT:    retq # sched: [7:1.00]
   7366   %vec = load <32 x i16>, <32 x i16>* %vp
   7367   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
   7368   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7369   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7370   ret <32 x i16> %res
   7371 }
   7372 
   7373 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Loads a <32 x i16> vector from %vp and shuffles it: indices 4-7 of every group of
        ; eight elements are permuted within the group, indices 0-3 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
        ; Both check prefixes expect vptestnmw to produce %k1 and one zero-masked ({z}) vpshufhw reading memory.
   7374 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
   7375 ; GENERIC:       # %bb.0:
   7376 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
   7377 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
   7378 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7379 ;
   7380 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
   7381 ; SKX:       # %bb.0:
   7382 ; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
   7383 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
   7384 ; SKX-NEXT:    retq # sched: [7:1.00]
   7385   %vec = load <32 x i16>, <32 x i16>* %vp
   7386   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
   7387   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7388   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7389   ret <32 x i16> %res
   7390 }
   7391 
   7392 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Loads a <32 x i16> vector from %vp and shuffles it: indices 0-3 of every group of
        ; eight elements are permuted within the group, indices 4-7 stay in place.
        ; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
        ; Both check prefixes expect vptestnmw to produce %k1 and one masked vpshuflw reading memory.
   7393 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask1:
   7394 ; GENERIC:       # %bb.0:
   7395 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7396 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
   7397 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7398 ;
   7399 ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask1:
   7400 ; SKX:       # %bb.0:
   7401 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7402 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
   7403 ; SKX-NEXT:    retq # sched: [7:1.00]
   7404   %vec = load <32 x i16>, <32 x i16>* %vp
   7405   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
   7406   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7407   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7408   ret <32 x i16> %res
   7409 }
   7410 
   7411 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked, memory-operand case: the vector loaded from %vp has the low
        ; four words of every 8-word group permuted as [1,1,3,3]. Lanes where %mask
        ; is zero take the shuffled value; all other lanes are zeroed. Both check
        ; prefixes expect the load folded into a single {z}-masked vpshuflw.
   7412 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
   7413 ; GENERIC:       # %bb.0:
   7414 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
   7415 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
   7416 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7417 ;
   7418 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
   7419 ; SKX:       # %bb.0:
   7420 ; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
   7421 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
   7422 ; SKX-NEXT:    retq # sched: [7:1.00]
   7423   %vec = load <32 x i16>, <32 x i16>* %vp
   7424   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
   7425   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7426   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7427   ret <32 x i16> %res
   7428 }
   7429 
   7430 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked, memory-operand case: the vector loaded from %vp has the high
        ; four words of every 8-word group permuted as [4,7,6,4]. Lanes where %mask
        ; is zero take the shuffled value; all other lanes keep %vec2. Both check
        ; prefixes expect the load folded into a single masked vpshufhw.
   7431 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask2:
   7432 ; GENERIC:       # %bb.0:
   7433 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7434 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
   7435 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7436 ;
   7437 ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask2:
   7438 ; SKX:       # %bb.0:
   7439 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7440 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
   7441 ; SKX-NEXT:    retq # sched: [7:1.00]
   7442   %vec = load <32 x i16>, <32 x i16>* %vp
   7443   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
   7444   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7445   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7446   ret <32 x i16> %res
   7447 }
   7448 
   7449 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked, memory-operand case: the vector loaded from %vp has the high
        ; four words of every 8-word group permuted as [4,7,6,4]. Lanes where %mask
        ; is zero take the shuffled value; all other lanes are zeroed. Both check
        ; prefixes expect the load folded into a single {z}-masked vpshufhw.
   7450 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
   7451 ; GENERIC:       # %bb.0:
   7452 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
   7453 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
   7454 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7455 ;
   7456 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
   7457 ; SKX:       # %bb.0:
   7458 ; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
   7459 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
   7460 ; SKX-NEXT:    retq # sched: [7:1.00]
   7461   %vec = load <32 x i16>, <32 x i16>* %vp
   7462   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
   7463   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7464   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7465   ret <32 x i16> %res
   7466 }
   7467 
   7468 define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
        ; Unmasked, memory-operand case: the vector loaded from %vp has the low four
        ; words of every 8-word group permuted as [2,2,0,3]. Both check prefixes
        ; expect the load folded into a single vpshuflw.
   7469 ; GENERIC-LABEL: test_32xi16_perm_low_mem_mask3:
   7470 ; GENERIC:       # %bb.0:
   7471 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
   7472 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7473 ;
   7474 ; SKX-LABEL: test_32xi16_perm_low_mem_mask3:
   7475 ; SKX:       # %bb.0:
   7476 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
   7477 ; SKX-NEXT:    retq # sched: [7:1.00]
   7478   %vec = load <32 x i16>, <32 x i16>* %vp
   7479   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
   7480   ret <32 x i16> %res
   7481 }
   7482 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked, memory-operand case: the vector loaded from %vp has the low
        ; four words of every 8-word group permuted as [2,2,0,3]. Lanes where %mask
        ; is zero take the shuffled value; all other lanes keep %vec2. Both check
        ; prefixes expect the load folded into a single masked vpshuflw.
   7483 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask3:
   7484 ; GENERIC:       # %bb.0:
   7485 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7486 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
   7487 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7488 ;
   7489 ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask3:
   7490 ; SKX:       # %bb.0:
   7491 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7492 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
   7493 ; SKX-NEXT:    retq # sched: [7:1.00]
   7494   %vec = load <32 x i16>, <32 x i16>* %vp
   7495   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
   7496   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7497   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7498   ret <32 x i16> %res
   7499 }
   7500 
   7501 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked, memory-operand case: the vector loaded from %vp has the low
        ; four words of every 8-word group permuted as [2,2,0,3]. Lanes where %mask
        ; is zero take the shuffled value; all other lanes are zeroed. Both check
        ; prefixes expect the load folded into a single {z}-masked vpshuflw.
   7502 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
   7503 ; GENERIC:       # %bb.0:
   7504 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
   7505 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
   7506 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7507 ;
   7508 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
   7509 ; SKX:       # %bb.0:
   7510 ; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
   7511 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
   7512 ; SKX-NEXT:    retq # sched: [7:1.00]
   7513   %vec = load <32 x i16>, <32 x i16>* %vp
   7514   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
   7515   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7516   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7517   ret <32 x i16> %res
   7518 }
   7519 
   7520 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked, memory-operand case: the vector loaded from %vp has the high
        ; four words of every 8-word group permuted as [7,4,6,5]. Lanes where %mask
        ; is zero take the shuffled value; all other lanes keep %vec2. Both check
        ; prefixes expect the load folded into a single masked vpshufhw.
   7521 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask4:
   7522 ; GENERIC:       # %bb.0:
   7523 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7524 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
   7525 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7526 ;
   7527 ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask4:
   7528 ; SKX:       # %bb.0:
   7529 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7530 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
   7531 ; SKX-NEXT:    retq # sched: [7:1.00]
   7532   %vec = load <32 x i16>, <32 x i16>* %vp
   7533   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
   7534   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7535   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7536   ret <32 x i16> %res
   7537 }
   7538 
   7539 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked, memory-operand case: the vector loaded from %vp has the high
        ; four words of every 8-word group permuted as [7,4,6,5]. Lanes where %mask
        ; is zero take the shuffled value; all other lanes are zeroed. Both check
        ; prefixes expect the load folded into a single {z}-masked vpshufhw.
   7540 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
   7541 ; GENERIC:       # %bb.0:
   7542 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
   7543 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
   7544 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7545 ;
   7546 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
   7547 ; SKX:       # %bb.0:
   7548 ; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
   7549 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
   7550 ; SKX-NEXT:    retq # sched: [7:1.00]
   7551   %vec = load <32 x i16>, <32 x i16>* %vp
   7552   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
   7553   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7554   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7555   ret <32 x i16> %res
   7556 }
   7557 
   7558 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked, memory-operand case: the vector loaded from %vp has the low
        ; four words of every 8-word group permuted as [0,1,0,1]. Lanes where %mask
        ; is zero take the shuffled value; all other lanes keep %vec2.
        ; Unlike the neighbouring low/high word tests, both check prefixes expect
        ; an unmasked vpshufd from memory (dword pattern [0,0,2,3] per 4-dword
        ; group) into a temporary, followed by a merge-masked vmovdqu16.
   7559 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask5:
   7560 ; GENERIC:       # %bb.0:
   7561 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
   7562 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7563 ; GENERIC-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.50]
   7564 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7565 ;
   7566 ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask5:
   7567 ; SKX:       # %bb.0:
   7568 ; SKX-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
   7569 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7570 ; SKX-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
   7571 ; SKX-NEXT:    retq # sched: [7:1.00]
   7572   %vec = load <32 x i16>, <32 x i16>* %vp
   7573   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
   7574   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7575   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7576   ret <32 x i16> %res
   7577 }
   7578 
   7579 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked, memory-operand case: the vector loaded from %vp has the low
        ; four words of every 8-word group permuted as [0,1,0,1]. Lanes where %mask
        ; is zero take the shuffled value; all other lanes are zeroed.
        ; Unlike the neighbouring low/high word tests, both check prefixes expect
        ; an unmasked vpshufd from memory (dword pattern [0,0,2,3] per 4-dword
        ; group) into a temporary, followed by a {z}-masked vmovdqu16.
   7580 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
   7581 ; GENERIC:       # %bb.0:
   7582 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
   7583 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
   7584 ; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50]
   7585 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7586 ;
   7587 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
   7588 ; SKX:       # %bb.0:
   7589 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
   7590 ; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
   7591 ; SKX-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
   7592 ; SKX-NEXT:    retq # sched: [7:1.00]
   7593   %vec = load <32 x i16>, <32 x i16>* %vp
   7594   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
   7595   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7596   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7597   ret <32 x i16> %res
   7598 }
   7599 
   7600 define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
        ; Unmasked, memory-operand case: the vector loaded from %vp has the high
        ; four words of every 8-word group permuted as [6,5,6,6]. Both check
        ; prefixes expect the load folded into a single vpshufhw.
   7601 ; GENERIC-LABEL: test_32xi16_perm_high_mem_mask6:
   7602 ; GENERIC:       # %bb.0:
   7603 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
   7604 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7605 ;
   7606 ; SKX-LABEL: test_32xi16_perm_high_mem_mask6:
   7607 ; SKX:       # %bb.0:
   7608 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
   7609 ; SKX-NEXT:    retq # sched: [7:1.00]
   7610   %vec = load <32 x i16>, <32 x i16>* %vp
   7611   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
   7612   ret <32 x i16> %res
   7613 }
   7614 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked, memory-operand case: the vector loaded from %vp has the high
        ; four words of every 8-word group permuted as [6,5,6,6]. Lanes where %mask
        ; is zero take the shuffled value; all other lanes keep %vec2. Both check
        ; prefixes expect the load folded into a single masked vpshufhw.
   7615 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask6:
   7616 ; GENERIC:       # %bb.0:
   7617 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7618 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
   7619 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7620 ;
   7621 ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask6:
   7622 ; SKX:       # %bb.0:
   7623 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7624 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
   7625 ; SKX-NEXT:    retq # sched: [7:1.00]
   7626   %vec = load <32 x i16>, <32 x i16>* %vp
   7627   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
   7628   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7629   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7630   ret <32 x i16> %res
   7631 }
   7632 
   7633 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked, memory-operand case: the vector loaded from %vp has the high
        ; four words of every 8-word group permuted as [6,5,6,6]. Lanes where %mask
        ; is zero take the shuffled value; all other lanes are zeroed. Both check
        ; prefixes expect the load folded into a single {z}-masked vpshufhw.
   7634 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
   7635 ; GENERIC:       # %bb.0:
   7636 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
   7637 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
   7638 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7639 ;
   7640 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
   7641 ; SKX:       # %bb.0:
   7642 ; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
   7643 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
   7644 ; SKX-NEXT:    retq # sched: [7:1.00]
   7645   %vec = load <32 x i16>, <32 x i16>* %vp
   7646   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
   7647   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7648   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7649   ret <32 x i16> %res
   7650 }
   7651 
   7652 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
        ; Merge-masked, memory-operand case: the vector loaded from %vp has the low
        ; four words of every 8-word group permuted as [3,1,3,0]. Lanes where %mask
        ; is zero take the shuffled value; all other lanes keep %vec2. Both check
        ; prefixes expect the load folded into a single masked vpshuflw.
   7653 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask7:
   7654 ; GENERIC:       # %bb.0:
   7655 ; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
   7656 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
   7657 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7658 ;
   7659 ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask7:
   7660 ; SKX:       # %bb.0:
   7661 ; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
   7662 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
   7663 ; SKX-NEXT:    retq # sched: [7:1.00]
   7664   %vec = load <32 x i16>, <32 x i16>* %vp
   7665   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   7666   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7667   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   7668   ret <32 x i16> %res
   7669 }
   7670 
   7671 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) {
        ; Zero-masked, memory-operand case: the vector loaded from %vp has the low
        ; four words of every 8-word group permuted as [3,1,3,0]. Lanes where %mask
        ; is zero take the shuffled value; all other lanes are zeroed. Both check
        ; prefixes expect the load folded into a single {z}-masked vpshuflw.
   7672 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
   7673 ; GENERIC:       # %bb.0:
   7674 ; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
   7675 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
   7676 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7677 ;
   7678 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
   7679 ; SKX:       # %bb.0:
   7680 ; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
   7681 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
   7682 ; SKX-NEXT:    retq # sched: [7:1.00]
   7683   %vec = load <32 x i16>, <32 x i16>* %vp
   7684   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   7685   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   7686   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   7687   ret <32 x i16> %res
   7688 }
   7689 
   7690 define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) {
        ; Unmasked, register-operand case: permutes the four dwords of %vec as
        ; [2,3,3,0]. Both check prefixes expect a single vpermilps on xmm0.
   7691 ; GENERIC-LABEL: test_4xi32_perm_mask0:
   7692 ; GENERIC:       # %bb.0:
   7693 ; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00]
   7694 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7695 ;
   7696 ; SKX-LABEL: test_4xi32_perm_mask0:
   7697 ; SKX:       # %bb.0:
   7698 ; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00]
   7699 ; SKX-NEXT:    retq # sched: [7:1.00]
   7700   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
   7701   ret <4 x i32> %res
   7702 }
   7703 define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
        ; Merge-masked, register-operand case: permutes the four dwords of %vec as
        ; [2,3,3,0]. Lanes where %mask is zero take the shuffled value; all other
        ; lanes keep %vec2. Both check prefixes expect a masked vpshufd written
        ; into xmm1, then a vmovdqa copying the result to xmm0.
   7704 ; GENERIC-LABEL: test_masked_4xi32_perm_mask0:
   7705 ; GENERIC:       # %bb.0:
   7706 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   7707 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:0.50]
   7708 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   7709 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7710 ;
   7711 ; SKX-LABEL: test_masked_4xi32_perm_mask0:
   7712 ; SKX:       # %bb.0:
   7713 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   7714 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
   7715 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   7716 ; SKX-NEXT:    retq # sched: [7:1.00]
   7717   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
   7718   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   7719   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   7720   ret <4 x i32> %res
   7721 }
   7722 
   7723 define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) {
        ; Zero-masked, register-operand case: permutes the four dwords of %vec as
        ; [2,3,3,0]. Lanes where %mask is zero take the shuffled value; all other
        ; lanes are zeroed. Both check prefixes expect a single {z}-masked vpshufd.
   7724 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask0:
   7725 ; GENERIC:       # %bb.0:
   7726 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   7727 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:0.50]
   7728 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7729 ;
   7730 ; SKX-LABEL: test_masked_z_4xi32_perm_mask0:
   7731 ; SKX:       # %bb.0:
   7732 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   7733 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
   7734 ; SKX-NEXT:    retq # sched: [7:1.00]
   7735   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
   7736   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   7737   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   7738   ret <4 x i32> %res
   7739 }
   7740 define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
        ; Merge-masked, register-operand case: permutes the four dwords of %vec as
        ; [1,0,2,0]. Lanes where %mask is zero take the shuffled value; all other
        ; lanes keep %vec2. Both check prefixes expect a masked vpshufd written
        ; into xmm1, then a vmovdqa copying the result to xmm0.
   7741 ; GENERIC-LABEL: test_masked_4xi32_perm_mask1:
   7742 ; GENERIC:       # %bb.0:
   7743 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   7744 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:0.50]
   7745 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   7746 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7747 ;
   7748 ; SKX-LABEL: test_masked_4xi32_perm_mask1:
   7749 ; SKX:       # %bb.0:
   7750 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   7751 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
   7752 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   7753 ; SKX-NEXT:    retq # sched: [7:1.00]
   7754   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
   7755   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   7756   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   7757   ret <4 x i32> %res
   7758 }
   7759 
   7760 define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) {
        ; Zero-masked, register-operand case: permutes the four dwords of %vec as
        ; [1,0,2,0]. Lanes where %mask is zero take the shuffled value; all other
        ; lanes are zeroed. Both check prefixes expect a single {z}-masked vpshufd.
   7761 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask1:
   7762 ; GENERIC:       # %bb.0:
   7763 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   7764 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:0.50]
   7765 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7766 ;
   7767 ; SKX-LABEL: test_masked_z_4xi32_perm_mask1:
   7768 ; SKX:       # %bb.0:
   7769 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   7770 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
   7771 ; SKX-NEXT:    retq # sched: [7:1.00]
   7772   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
   7773   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   7774   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   7775   ret <4 x i32> %res
   7776 }
   7777 define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
        ; Merge-masked, register-operand case: permutes the four dwords of %vec as
        ; [3,0,1,0]. Lanes where %mask is zero take the shuffled value; all other
        ; lanes keep %vec2. Both check prefixes expect a masked vpshufd written
        ; into xmm1, then a vmovdqa copying the result to xmm0.
   7778 ; GENERIC-LABEL: test_masked_4xi32_perm_mask2:
   7779 ; GENERIC:       # %bb.0:
   7780 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   7781 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:0.50]
   7782 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   7783 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7784 ;
   7785 ; SKX-LABEL: test_masked_4xi32_perm_mask2:
   7786 ; SKX:       # %bb.0:
   7787 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   7788 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
   7789 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   7790 ; SKX-NEXT:    retq # sched: [7:1.00]
   7791   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
   7792   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   7793   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   7794   ret <4 x i32> %res
   7795 }
   7796 
   7797 define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) {
        ; Zero-masked, register-operand case: permutes the four dwords of %vec as
        ; [3,0,1,0]. Lanes where %mask is zero take the shuffled value; all other
        ; lanes are zeroed. Both check prefixes expect a single {z}-masked vpshufd.
   7798 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask2:
   7799 ; GENERIC:       # %bb.0:
   7800 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   7801 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:0.50]
   7802 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7803 ;
   7804 ; SKX-LABEL: test_masked_z_4xi32_perm_mask2:
   7805 ; SKX:       # %bb.0:
   7806 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   7807 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
   7808 ; SKX-NEXT:    retq # sched: [7:1.00]
   7809   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
   7810   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   7811   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   7812   ret <4 x i32> %res
   7813 }
   7814 define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) {
        ; Unmasked, register-operand case: permutes the four dwords of %vec as
        ; [1,1,0,3]. Both check prefixes expect a single vpermilps on xmm0.
   7815 ; GENERIC-LABEL: test_4xi32_perm_mask3:
   7816 ; GENERIC:       # %bb.0:
   7817 ; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00]
   7818 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7819 ;
   7820 ; SKX-LABEL: test_4xi32_perm_mask3:
   7821 ; SKX:       # %bb.0:
   7822 ; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00]
   7823 ; SKX-NEXT:    retq # sched: [7:1.00]
   7824   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   7825   ret <4 x i32> %res
   7826 }
   7827 define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
        ; Merge-masked, register-operand case: permutes the four dwords of %vec as
        ; [1,1,0,3]. Lanes where %mask is zero take the shuffled value; all other
        ; lanes keep %vec2. Both check prefixes expect a masked vpshufd written
        ; into xmm1, then a vmovdqa copying the result to xmm0.
   7828 ; GENERIC-LABEL: test_masked_4xi32_perm_mask3:
   7829 ; GENERIC:       # %bb.0:
   7830 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   7831 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:0.50]
   7832 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   7833 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7834 ;
   7835 ; SKX-LABEL: test_masked_4xi32_perm_mask3:
   7836 ; SKX:       # %bb.0:
   7837 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   7838 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
   7839 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
   7840 ; SKX-NEXT:    retq # sched: [7:1.00]
   7841   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   7842   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   7843   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   7844   ret <4 x i32> %res
   7845 }
   7846 
   7847 define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) {
        ; Zero-masked, register-operand case: permutes the four dwords of %vec as
        ; [1,1,0,3]. Lanes where %mask is zero take the shuffled value; all other
        ; lanes are zeroed. Both check prefixes expect a single {z}-masked vpshufd.
   7848 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask3:
   7849 ; GENERIC:       # %bb.0:
   7850 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   7851 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:0.50]
   7852 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7853 ;
   7854 ; SKX-LABEL: test_masked_z_4xi32_perm_mask3:
   7855 ; SKX:       # %bb.0:
   7856 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   7857 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
   7858 ; SKX-NEXT:    retq # sched: [7:1.00]
   7859   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   7860   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   7861   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   7862   ret <4 x i32> %res
   7863 }
   7864 define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) {
        ; Unmasked, memory-operand case: permutes the four dwords loaded from %vp
        ; as [0,1,3,3]. Both check prefixes expect the load folded into a single
        ; vpermilps.
   7865 ; GENERIC-LABEL: test_4xi32_perm_mem_mask0:
   7866 ; GENERIC:       # %bb.0:
   7867 ; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00]
   7868 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7869 ;
   7870 ; SKX-LABEL: test_4xi32_perm_mem_mask0:
   7871 ; SKX:       # %bb.0:
   7872 ; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00]
   7873 ; SKX-NEXT:    retq # sched: [7:1.00]
   7874   %vec = load <4 x i32>, <4 x i32>* %vp
   7875   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   7876   ret <4 x i32> %res
   7877 }
   7878 define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
        ; Merge-masked, memory-operand case: permutes the four dwords loaded from
        ; %vp as [0,1,3,3]. Lanes where %mask is zero take the shuffled value; all
        ; other lanes keep %vec2. Both check prefixes expect the load folded into a
        ; single masked vpshufd.
   7879 ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask0:
   7880 ; GENERIC:       # %bb.0:
   7881 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   7882 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:0.50]
   7883 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7884 ;
   7885 ; SKX-LABEL: test_masked_4xi32_perm_mem_mask0:
   7886 ; SKX:       # %bb.0:
   7887 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   7888 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:1.00]
   7889 ; SKX-NEXT:    retq # sched: [7:1.00]
   7890   %vec = load <4 x i32>, <4 x i32>* %vp
   7891   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   7892   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   7893   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   7894   ret <4 x i32> %res
   7895 }
   7896 
   7897 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) { ; Zero-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1} {z} vpshufd reading memory.
   7898 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask0:
   7899 ; GENERIC:       # %bb.0:
   7900 ; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
   7901 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:0.50]
   7902 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7903 ;
   7904 ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask0:
   7905 ; SKX:       # %bb.0:
   7906 ; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
   7907 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:1.00]
   7908 ; SKX-NEXT:    retq # sched: [7:1.00]
   7909   %vec = load <4 x i32>, <4 x i32>* %vp
   7910   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   7911   %cmp = icmp eq <4 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   7912   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer  ; selected lanes get the permuted value, the others are zeroed
   7913   ret <4 x i32> %res
   7914 }
   7915 
   7916 define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; Merge-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1}-masked vpshufd reading memory.
   7917 ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask1:
   7918 ; GENERIC:       # %bb.0:
   7919 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   7920 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:0.50]
   7921 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7922 ;
   7923 ; SKX-LABEL: test_masked_4xi32_perm_mem_mask1:
   7924 ; SKX:       # %bb.0:
   7925 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   7926 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:1.00]
   7927 ; SKX-NEXT:    retq # sched: [7:1.00]
   7928   %vec = load <4 x i32>, <4 x i32>* %vp
   7929   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
   7930   %cmp = icmp eq <4 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   7931   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2  ; selected lanes get the permuted value, the others keep %vec2
   7932   ret <4 x i32> %res
   7933 }
   7934 
   7935 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) { ; Zero-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1} {z} vpshufd reading memory.
   7936 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask1:
   7937 ; GENERIC:       # %bb.0:
   7938 ; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
   7939 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:0.50]
   7940 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7941 ;
   7942 ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask1:
   7943 ; SKX:       # %bb.0:
   7944 ; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
   7945 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:1.00]
   7946 ; SKX-NEXT:    retq # sched: [7:1.00]
   7947   %vec = load <4 x i32>, <4 x i32>* %vp
   7948   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
   7949   %cmp = icmp eq <4 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   7950   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer  ; selected lanes get the permuted value, the others are zeroed
   7951   ret <4 x i32> %res
   7952 }
   7953 
   7954 define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; Merge-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1}-masked vpshufd reading memory.
   7955 ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask2:
   7956 ; GENERIC:       # %bb.0:
   7957 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   7958 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:0.50]
   7959 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7960 ;
   7961 ; SKX-LABEL: test_masked_4xi32_perm_mem_mask2:
   7962 ; SKX:       # %bb.0:
   7963 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   7964 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:1.00]
   7965 ; SKX-NEXT:    retq # sched: [7:1.00]
   7966   %vec = load <4 x i32>, <4 x i32>* %vp
   7967   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
   7968   %cmp = icmp eq <4 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   7969   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2  ; selected lanes get the permuted value, the others keep %vec2
   7970   ret <4 x i32> %res
   7971 }
   7972 
   7973 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) { ; Zero-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1} {z} vpshufd reading memory.
   7974 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask2:
   7975 ; GENERIC:       # %bb.0:
   7976 ; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
   7977 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:0.50]
   7978 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7979 ;
   7980 ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask2:
   7981 ; SKX:       # %bb.0:
   7982 ; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
   7983 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:1.00]
   7984 ; SKX-NEXT:    retq # sched: [7:1.00]
   7985   %vec = load <4 x i32>, <4 x i32>* %vp
   7986   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
   7987   %cmp = icmp eq <4 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   7988   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer  ; selected lanes get the permuted value, the others are zeroed
   7989   ret <4 x i32> %res
   7990 }
   7991 
   7992 define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) { ; Unmasked permute of the vector loaded from %vp; the assertions expect a single vpermilps reading memory.
   7993 ; GENERIC-LABEL: test_4xi32_perm_mem_mask3:
   7994 ; GENERIC:       # %bb.0:
   7995 ; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00]
   7996 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7997 ;
   7998 ; SKX-LABEL: test_4xi32_perm_mem_mask3:
   7999 ; SKX:       # %bb.0:
   8000 ; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00]
   8001 ; SKX-NEXT:    retq # sched: [7:1.00]
   8002   %vec = load <4 x i32>, <4 x i32>* %vp
   8003   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   8004   ret <4 x i32> %res
   8005 }
   8006 define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; Merge-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1}-masked vpshufd reading memory.
   8007 ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask3:
   8008 ; GENERIC:       # %bb.0:
   8009 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   8010 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:0.50]
   8011 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8012 ;
   8013 ; SKX-LABEL: test_masked_4xi32_perm_mem_mask3:
   8014 ; SKX:       # %bb.0:
   8015 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   8016 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:1.00]
   8017 ; SKX-NEXT:    retq # sched: [7:1.00]
   8018   %vec = load <4 x i32>, <4 x i32>* %vp
   8019   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   8020   %cmp = icmp eq <4 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8021   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2  ; selected lanes get the permuted value, the others keep %vec2
   8022   ret <4 x i32> %res
   8023 }
   8024 
   8025 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) { ; Zero-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1} {z} vpshufd reading memory.
   8026 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask3:
   8027 ; GENERIC:       # %bb.0:
   8028 ; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
   8029 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:0.50]
   8030 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8031 ;
   8032 ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask3:
   8033 ; SKX:       # %bb.0:
   8034 ; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
   8035 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:1.00]
   8036 ; SKX-NEXT:    retq # sched: [7:1.00]
   8037   %vec = load <4 x i32>, <4 x i32>* %vp
   8038   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   8039   %cmp = icmp eq <4 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8040   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer  ; selected lanes get the permuted value, the others are zeroed
   8041   ret <4 x i32> %res
   8042 }
   8043 
   8044 define <8 x i32> @test2_8xi32_perm_mask0(<8 x i32> %vec) { ; Unmasked permute of %vec by constant indices; the assertions expect a single vpermilps.
   8045 ; GENERIC-LABEL: test2_8xi32_perm_mask0:
   8046 ; GENERIC:       # %bb.0:
   8047 ; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
   8048 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8049 ;
   8050 ; SKX-LABEL: test2_8xi32_perm_mask0:
   8051 ; SKX:       # %bb.0:
   8052 ; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
   8053 ; SKX-NEXT:    retq # sched: [7:1.00]
   8054   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
   8055   ret <8 x i32> %res
   8056 }
   8057 define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked permute of %vec; the assertions expect vptestnmd, a {%k1}-masked vpshufd and a register copy.
   8058 ; GENERIC-LABEL: test2_masked_8xi32_perm_mask0:
   8059 ; GENERIC:       # %bb.0:
   8060 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   8061 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
   8062 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   8063 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8064 ;
   8065 ; SKX-LABEL: test2_masked_8xi32_perm_mask0:
   8066 ; SKX:       # %bb.0:
   8067 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   8068 ; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
   8069 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   8070 ; SKX-NEXT:    retq # sched: [7:1.00]
   8071   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
   8072   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8073   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2  ; selected lanes get the permuted value, the others keep %vec2
   8074   ret <8 x i32> %res
   8075 }
   8076 
   8077 define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) { ; Zero-masked permute of %vec; the assertions expect vptestnmd followed by a {%k1} {z} vpshufd.
   8078 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask0:
   8079 ; GENERIC:       # %bb.0:
   8080 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   8081 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
   8082 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8083 ;
   8084 ; SKX-LABEL: test2_masked_z_8xi32_perm_mask0:
   8085 ; SKX:       # %bb.0:
   8086 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   8087 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
   8088 ; SKX-NEXT:    retq # sched: [7:1.00]
   8089   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
   8090   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8091   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; selected lanes get the permuted value, the others are zeroed
   8092   ret <8 x i32> %res
   8093 }
   8094 define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked permute of %vec; the assertions expect vptestnmd, a {%k1}-masked vpshufd and a register copy.
   8095 ; GENERIC-LABEL: test2_masked_8xi32_perm_mask1:
   8096 ; GENERIC:       # %bb.0:
   8097 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   8098 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
   8099 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   8100 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8101 ;
   8102 ; SKX-LABEL: test2_masked_8xi32_perm_mask1:
   8103 ; SKX:       # %bb.0:
   8104 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   8105 ; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
   8106 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   8107 ; SKX-NEXT:    retq # sched: [7:1.00]
   8108   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
   8109   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8110   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2  ; selected lanes get the permuted value, the others keep %vec2
   8111   ret <8 x i32> %res
   8112 }
   8113 
   8114 define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) { ; Zero-masked permute of %vec; the assertions expect vptestnmd followed by a {%k1} {z} vpshufd.
   8115 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask1:
   8116 ; GENERIC:       # %bb.0:
   8117 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   8118 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
   8119 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8120 ;
   8121 ; SKX-LABEL: test2_masked_z_8xi32_perm_mask1:
   8122 ; SKX:       # %bb.0:
   8123 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   8124 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
   8125 ; SKX-NEXT:    retq # sched: [7:1.00]
   8126   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
   8127   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8128   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; selected lanes get the permuted value, the others are zeroed
   8129   ret <8 x i32> %res
   8130 }
   8131 define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked permute of %vec; the assertions expect vptestnmd, a {%k1}-masked vpshufd and a register copy.
   8132 ; GENERIC-LABEL: test2_masked_8xi32_perm_mask2:
   8133 ; GENERIC:       # %bb.0:
   8134 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   8135 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
   8136 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   8137 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8138 ;
   8139 ; SKX-LABEL: test2_masked_8xi32_perm_mask2:
   8140 ; SKX:       # %bb.0:
   8141 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   8142 ; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
   8143 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   8144 ; SKX-NEXT:    retq # sched: [7:1.00]
   8145   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
   8146   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8147   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2  ; selected lanes get the permuted value, the others keep %vec2
   8148   ret <8 x i32> %res
   8149 }
   8150 
   8151 define <8 x i32> @test2_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) { ; Zero-masked permute of %vec; the assertions expect vptestnmd followed by a {%k1} {z} vpshufd.
   8152 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask2:
   8153 ; GENERIC:       # %bb.0:
   8154 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   8155 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
   8156 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8157 ;
   8158 ; SKX-LABEL: test2_masked_z_8xi32_perm_mask2:
   8159 ; SKX:       # %bb.0:
   8160 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   8161 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
   8162 ; SKX-NEXT:    retq # sched: [7:1.00]
   8163   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
   8164   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8165   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; selected lanes get the permuted value, the others are zeroed
   8166   ret <8 x i32> %res
   8167 }
   8168 define <8 x i32> @test2_8xi32_perm_mask3(<8 x i32> %vec) { ; Unmasked permute of %vec by constant indices; the assertions expect a single vpermilps.
   8169 ; GENERIC-LABEL: test2_8xi32_perm_mask3:
   8170 ; GENERIC:       # %bb.0:
   8171 ; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
   8172 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8173 ;
   8174 ; SKX-LABEL: test2_8xi32_perm_mask3:
   8175 ; SKX:       # %bb.0:
   8176 ; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
   8177 ; SKX-NEXT:    retq # sched: [7:1.00]
   8178   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
   8179   ret <8 x i32> %res
   8180 }
   8181 define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked permute of %vec; the assertions expect vptestnmd, a {%k1}-masked vpshufd and a register copy.
   8182 ; GENERIC-LABEL: test2_masked_8xi32_perm_mask3:
   8183 ; GENERIC:       # %bb.0:
   8184 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   8185 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
   8186 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   8187 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8188 ;
   8189 ; SKX-LABEL: test2_masked_8xi32_perm_mask3:
   8190 ; SKX:       # %bb.0:
   8191 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   8192 ; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
   8193 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   8194 ; SKX-NEXT:    retq # sched: [7:1.00]
   8195   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
   8196   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8197   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2  ; selected lanes get the permuted value, the others keep %vec2
   8198   ret <8 x i32> %res
   8199 }
   8200 
   8201 define <8 x i32> @test2_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) { ; Zero-masked permute of %vec; the assertions expect vptestnmd followed by a {%k1} {z} vpshufd.
   8202 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask3:
   8203 ; GENERIC:       # %bb.0:
   8204 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   8205 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
   8206 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8207 ;
   8208 ; SKX-LABEL: test2_masked_z_8xi32_perm_mask3:
   8209 ; SKX:       # %bb.0:
   8210 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   8211 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
   8212 ; SKX-NEXT:    retq # sched: [7:1.00]
   8213   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
   8214   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8215   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; selected lanes get the permuted value, the others are zeroed
   8216   ret <8 x i32> %res
   8217 }
   8218 define <8 x i32> @test2_8xi32_perm_mem_mask0(<8 x i32>* %vp) { ; Unmasked permute of the vector loaded from %vp; the assertions expect a single vpermilps reading memory.
   8219 ; GENERIC-LABEL: test2_8xi32_perm_mem_mask0:
   8220 ; GENERIC:       # %bb.0:
   8221 ; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
   8222 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8223 ;
   8224 ; SKX-LABEL: test2_8xi32_perm_mem_mask0:
   8225 ; SKX:       # %bb.0:
   8226 ; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
   8227 ; SKX-NEXT:    retq # sched: [7:1.00]
   8228   %vec = load <8 x i32>, <8 x i32>* %vp
   8229   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
   8230   ret <8 x i32> %res
   8231 }
   8232 define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1}-masked vpshufd reading memory.
   8233 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask0:
   8234 ; GENERIC:       # %bb.0:
   8235 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   8236 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
   8237 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8238 ;
   8239 ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask0:
   8240 ; SKX:       # %bb.0:
   8241 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   8242 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
   8243 ; SKX-NEXT:    retq # sched: [7:1.00]
   8244   %vec = load <8 x i32>, <8 x i32>* %vp
   8245   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
   8246   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8247   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2  ; selected lanes get the permuted value, the others keep %vec2
   8248   ret <8 x i32> %res
   8249 }
   8250 
   8251 define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) { ; Zero-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1} {z} vpshufd reading memory.
   8252 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
   8253 ; GENERIC:       # %bb.0:
   8254 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
   8255 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
   8256 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8257 ;
   8258 ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
   8259 ; SKX:       # %bb.0:
   8260 ; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
   8261 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
   8262 ; SKX-NEXT:    retq # sched: [7:1.00]
   8263   %vec = load <8 x i32>, <8 x i32>* %vp
   8264   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
   8265   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8266   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; selected lanes get the permuted value, the others are zeroed
   8267   ret <8 x i32> %res
   8268 }
   8269 
   8270 define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1}-masked vpshufd reading memory.
   8271 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask1:
   8272 ; GENERIC:       # %bb.0:
   8273 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   8274 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
   8275 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8276 ;
   8277 ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask1:
   8278 ; SKX:       # %bb.0:
   8279 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   8280 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
   8281 ; SKX-NEXT:    retq # sched: [7:1.00]
   8282   %vec = load <8 x i32>, <8 x i32>* %vp
   8283   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
   8284   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8285   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2  ; selected lanes get the permuted value, the others keep %vec2
   8286   ret <8 x i32> %res
   8287 }
   8288 
   8289 define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) { ; Zero-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1} {z} vpshufd reading memory.
   8290 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
   8291 ; GENERIC:       # %bb.0:
   8292 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
   8293 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
   8294 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8295 ;
   8296 ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
   8297 ; SKX:       # %bb.0:
   8298 ; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
   8299 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
   8300 ; SKX-NEXT:    retq # sched: [7:1.00]
   8301   %vec = load <8 x i32>, <8 x i32>* %vp
   8302   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
   8303   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8304   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; selected lanes get the permuted value, the others are zeroed
   8305   ret <8 x i32> %res
   8306 }
   8307 
   8308 define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1}-masked vpshufd reading memory.
   8309 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask2:
   8310 ; GENERIC:       # %bb.0:
   8311 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   8312 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
   8313 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8314 ;
   8315 ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask2:
   8316 ; SKX:       # %bb.0:
   8317 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   8318 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
   8319 ; SKX-NEXT:    retq # sched: [7:1.00]
   8320   %vec = load <8 x i32>, <8 x i32>* %vp
   8321   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
   8322   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8323   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2  ; selected lanes get the permuted value, the others keep %vec2
   8324   ret <8 x i32> %res
   8325 }
   8326 
   8327 define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) { ; Zero-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1} {z} vpshufd reading memory.
   8328 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
   8329 ; GENERIC:       # %bb.0:
   8330 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
   8331 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
   8332 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8333 ;
   8334 ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
   8335 ; SKX:       # %bb.0:
   8336 ; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
   8337 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
   8338 ; SKX-NEXT:    retq # sched: [7:1.00]
   8339   %vec = load <8 x i32>, <8 x i32>* %vp
   8340   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
   8341   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8342   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; selected lanes get the permuted value, the others are zeroed
   8343   ret <8 x i32> %res
   8344 }
   8345 
   8346 define <8 x i32> @test2_8xi32_perm_mem_mask3(<8 x i32>* %vp) { ; Unmasked permute of the vector loaded from %vp; the assertions expect a single vpermilps reading memory.
   8347 ; GENERIC-LABEL: test2_8xi32_perm_mem_mask3:
   8348 ; GENERIC:       # %bb.0:
   8349 ; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
   8350 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8351 ;
   8352 ; SKX-LABEL: test2_8xi32_perm_mem_mask3:
   8353 ; SKX:       # %bb.0:
   8354 ; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
   8355 ; SKX-NEXT:    retq # sched: [7:1.00]
   8356   %vec = load <8 x i32>, <8 x i32>* %vp
   8357   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
   8358   ret <8 x i32> %res
   8359 }
   8360 define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; Merge-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1}-masked vpshufd reading memory.
   8361 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask3:
   8362 ; GENERIC:       # %bb.0:
   8363 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   8364 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
   8365 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8366 ;
   8367 ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask3:
   8368 ; SKX:       # %bb.0:
   8369 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   8370 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
   8371 ; SKX-NEXT:    retq # sched: [7:1.00]
   8372   %vec = load <8 x i32>, <8 x i32>* %vp
   8373   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
   8374   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8375   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2  ; selected lanes get the permuted value, the others keep %vec2
   8376   ret <8 x i32> %res
   8377 }
   8378 
   8379 define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) { ; Zero-masked permute of the vector loaded from %vp; the assertions expect vptestnmd and a {%k1} {z} vpshufd reading memory.
   8380 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
   8381 ; GENERIC:       # %bb.0:
   8382 ; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
   8383 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
   8384 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8385 ;
   8386 ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
   8387 ; SKX:       # %bb.0:
   8388 ; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
   8389 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
   8390 ; SKX-NEXT:    retq # sched: [7:1.00]
   8391   %vec = load <8 x i32>, <8 x i32>* %vp
   8392   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
   8393   %cmp = icmp eq <8 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8394   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; selected lanes get the permuted value, the others are zeroed
   8395   ret <8 x i32> %res
   8396 }
   8397 
   8398 define <16 x i32> @test2_16xi32_perm_mask0(<16 x i32> %vec) { ; Unmasked permute of %vec by constant indices; the assertions expect a single vpermilps.
   8399 ; GENERIC-LABEL: test2_16xi32_perm_mask0:
   8400 ; GENERIC:       # %bb.0:
   8401 ; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
   8402 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8403 ;
   8404 ; SKX-LABEL: test2_16xi32_perm_mask0:
   8405 ; SKX:       # %bb.0:
   8406 ; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
   8407 ; SKX-NEXT:    retq # sched: [7:1.00]
   8408   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   8409   ret <16 x i32> %res
   8410 }
   8411 define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; Merge-masked permute of %vec; the assertions expect vptestnmd, a {%k1}-masked vpshufd and a register copy.
   8412 ; GENERIC-LABEL: test2_masked_16xi32_perm_mask0:
   8413 ; GENERIC:       # %bb.0:
   8414 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   8415 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
   8416 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   8417 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8418 ;
   8419 ; SKX-LABEL: test2_masked_16xi32_perm_mask0:
   8420 ; SKX:       # %bb.0:
   8421 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   8422 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
   8423 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   8424 ; SKX-NEXT:    retq # sched: [7:1.00]
   8425   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   8426   %cmp = icmp eq <16 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8427   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2  ; selected lanes get the permuted value, the others keep %vec2
   8428   ret <16 x i32> %res
   8429 }
   8430 
   8431 define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) { ; Zero-masked permute of %vec; the assertions expect vptestnmd followed by a {%k1} {z} vpshufd.
   8432 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask0:
   8433 ; GENERIC:       # %bb.0:
   8434 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   8435 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
   8436 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8437 ;
   8438 ; SKX-LABEL: test2_masked_z_16xi32_perm_mask0:
   8439 ; SKX:       # %bb.0:
   8440 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   8441 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
   8442 ; SKX-NEXT:    retq # sched: [7:1.00]
   8443   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   8444   %cmp = icmp eq <16 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8445   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer  ; selected lanes get the permuted value, the others are zeroed
   8446   ret <16 x i32> %res
   8447 }
   8448 define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; Merge-masked permute of %vec; the assertions expect vptestnmd, a {%k1}-masked vpshufd and a register copy.
   8449 ; GENERIC-LABEL: test2_masked_16xi32_perm_mask1:
   8450 ; GENERIC:       # %bb.0:
   8451 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   8452 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
   8453 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   8454 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8455 ;
   8456 ; SKX-LABEL: test2_masked_16xi32_perm_mask1:
   8457 ; SKX:       # %bb.0:
   8458 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   8459 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
   8460 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   8461 ; SKX-NEXT:    retq # sched: [7:1.00]
   8462   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
   8463   %cmp = icmp eq <16 x i32> %mask, zeroinitializer  ; a lane is selected where its %mask element is zero
   8464   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2  ; selected lanes get the permuted value, the others keep %vec2
   8465   ret <16 x i32> %res
   8466 }
   8467 
   8468 define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
        ; Zero-masked in-group permute: each group of four i32 elements of %vec is
        ; reordered as [2,0,3,0]; a result element is kept where the matching %mask
        ; element equals 0 and is zero otherwise.
        ; Both check prefixes expect vptestnmd into %k1 followed by a {z} vpshufd.
   8469 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask1:
   8470 ; GENERIC:       # %bb.0:
   8471 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   8472 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
   8473 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8474 ;
   8475 ; SKX-LABEL: test2_masked_z_16xi32_perm_mask1:
   8476 ; SKX:       # %bb.0:
   8477 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   8478 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
   8479 ; SKX-NEXT:    retq # sched: [7:1.00]
   8480   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
   8481   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   8482   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   8483   ret <16 x i32> %res
   8484 }
   8485 define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
        ; Merge-masked in-group permute: each group of four i32 elements of %vec is
        ; reordered as [1,3,3,0]; a result element comes from the shuffle where the
        ; matching %mask element equals 0 and from %vec2 otherwise.
        ; Both check prefixes expect vptestnmd, a {%k1}-masked vpshufd into zmm1,
        ; then a vmovdqa64 of zmm1 into zmm0.
   8486 ; GENERIC-LABEL: test2_masked_16xi32_perm_mask2:
   8487 ; GENERIC:       # %bb.0:
   8488 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   8489 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
   8490 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   8491 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8492 ;
   8493 ; SKX-LABEL: test2_masked_16xi32_perm_mask2:
   8494 ; SKX:       # %bb.0:
   8495 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   8496 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
   8497 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   8498 ; SKX-NEXT:    retq # sched: [7:1.00]
   8499   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
   8500   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   8501   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   8502   ret <16 x i32> %res
   8503 }
   8504 
   8505 define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
        ; Zero-masked in-group permute: each group of four i32 elements of %vec is
        ; reordered as [1,3,3,0]; a result element is kept where the matching %mask
        ; element equals 0 and is zero otherwise.
        ; Both check prefixes expect vptestnmd into %k1 followed by a {z} vpshufd.
   8506 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask2:
   8507 ; GENERIC:       # %bb.0:
   8508 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   8509 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
   8510 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8511 ;
   8512 ; SKX-LABEL: test2_masked_z_16xi32_perm_mask2:
   8513 ; SKX:       # %bb.0:
   8514 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   8515 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
   8516 ; SKX-NEXT:    retq # sched: [7:1.00]
   8517   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
   8518   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   8519   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   8520   ret <16 x i32> %res
   8521 }
   8522 define <16 x i32> @test2_16xi32_perm_mask3(<16 x i32> %vec) {
        ; Unmasked in-group permute: each group of four i32 elements of %vec is
        ; reordered as [3,2,0,3].
        ; Both check prefixes expect a single vpermilps on zmm0 (the masked
        ; variants of this shuffle expect vpshufd instead).
   8523 ; GENERIC-LABEL: test2_16xi32_perm_mask3:
   8524 ; GENERIC:       # %bb.0:
   8525 ; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
   8526 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8527 ;
   8528 ; SKX-LABEL: test2_16xi32_perm_mask3:
   8529 ; SKX:       # %bb.0:
   8530 ; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
   8531 ; SKX-NEXT:    retq # sched: [7:1.00]
   8532   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   8533   ret <16 x i32> %res
   8534 }
   8535 define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
        ; Merge-masked in-group permute: each group of four i32 elements of %vec is
        ; reordered as [3,2,0,3]; a result element comes from the shuffle where the
        ; matching %mask element equals 0 and from %vec2 otherwise.
        ; Both check prefixes expect vptestnmd, a {%k1}-masked vpshufd into zmm1,
        ; then a vmovdqa64 of zmm1 into zmm0.
   8536 ; GENERIC-LABEL: test2_masked_16xi32_perm_mask3:
   8537 ; GENERIC:       # %bb.0:
   8538 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   8539 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
   8540 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   8541 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8542 ;
   8543 ; SKX-LABEL: test2_masked_16xi32_perm_mask3:
   8544 ; SKX:       # %bb.0:
   8545 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   8546 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
   8547 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   8548 ; SKX-NEXT:    retq # sched: [7:1.00]
   8549   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   8550   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   8551   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   8552   ret <16 x i32> %res
   8553 }
   8554 
   8555 define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
        ; Zero-masked in-group permute: each group of four i32 elements of %vec is
        ; reordered as [3,2,0,3]; a result element is kept where the matching %mask
        ; element equals 0 and is zero otherwise.
        ; Both check prefixes expect vptestnmd into %k1 followed by a {z} vpshufd.
   8556 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask3:
   8557 ; GENERIC:       # %bb.0:
   8558 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   8559 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
   8560 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8561 ;
   8562 ; SKX-LABEL: test2_masked_z_16xi32_perm_mask3:
   8563 ; SKX:       # %bb.0:
   8564 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   8565 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
   8566 ; SKX-NEXT:    retq # sched: [7:1.00]
   8567   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   8568   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   8569   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   8570   ret <16 x i32> %res
   8571 }
   8572 define <16 x i32> @test2_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
        ; Unmasked in-group permute of a vector loaded from %vp: each group of four
        ; i32 elements is reordered as [1,0,1,3].
        ; Both check prefixes expect one vpermilps whose source is the memory
        ; operand, i.e. no separate load instruction.
   8573 ; GENERIC-LABEL: test2_16xi32_perm_mem_mask0:
   8574 ; GENERIC:       # %bb.0:
   8575 ; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
   8576 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8577 ;
   8578 ; SKX-LABEL: test2_16xi32_perm_mem_mask0:
   8579 ; SKX:       # %bb.0:
   8580 ; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
   8581 ; SKX-NEXT:    retq # sched: [7:1.00]
   8582   %vec = load <16 x i32>, <16 x i32>* %vp
   8583   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
   8584   ret <16 x i32> %res
   8585 }
   8586 define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
        ; Merge-masked in-group permute of a vector loaded from %vp: each group of
        ; four i32 elements is reordered as [1,0,1,3]; a result element comes from
        ; the shuffle where the matching %mask element equals 0 and from %vec2
        ; otherwise.
        ; Both check prefixes expect vptestnmd and a {%k1}-masked vpshufd that reads
        ; its source from memory and writes zmm0 directly (no trailing move).
   8587 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask0:
   8588 ; GENERIC:       # %bb.0:
   8589 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   8590 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
   8591 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8592 ;
   8593 ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask0:
   8594 ; SKX:       # %bb.0:
   8595 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   8596 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
   8597 ; SKX-NEXT:    retq # sched: [7:1.00]
   8598   %vec = load <16 x i32>, <16 x i32>* %vp
   8599   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
   8600   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   8601   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   8602   ret <16 x i32> %res
   8603 }
   8604 
   8605 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
        ; Zero-masked in-group permute of a vector loaded from %vp: each group of
        ; four i32 elements is reordered as [1,0,1,3]; a result element is kept
        ; where the matching %mask element equals 0 and is zero otherwise.
        ; Both check prefixes expect vptestnmd on zmm0 and a {z} vpshufd that reads
        ; its source from memory.
   8606 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
   8607 ; GENERIC:       # %bb.0:
   8608 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
   8609 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
   8610 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8611 ;
   8612 ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
   8613 ; SKX:       # %bb.0:
   8614 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
   8615 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
   8616 ; SKX-NEXT:    retq # sched: [7:1.00]
   8617   %vec = load <16 x i32>, <16 x i32>* %vp
   8618   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
   8619   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   8620   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   8621   ret <16 x i32> %res
   8622 }
   8623 
   8624 define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
        ; Merge-masked in-group permute of a vector loaded from %vp: each group of
        ; four i32 elements is reordered as [1,0,0,2]; a result element comes from
        ; the shuffle where the matching %mask element equals 0 and from %vec2
        ; otherwise.
        ; Both check prefixes expect vptestnmd and a {%k1}-masked vpshufd that reads
        ; its source from memory and writes zmm0 directly (no trailing move).
   8625 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask1:
   8626 ; GENERIC:       # %bb.0:
   8627 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   8628 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
   8629 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8630 ;
   8631 ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask1:
   8632 ; SKX:       # %bb.0:
   8633 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   8634 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
   8635 ; SKX-NEXT:    retq # sched: [7:1.00]
   8636   %vec = load <16 x i32>, <16 x i32>* %vp
   8637   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
   8638   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   8639   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   8640   ret <16 x i32> %res
   8641 }
   8642 
   8643 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
        ; Zero-masked in-group permute of a vector loaded from %vp: each group of
        ; four i32 elements is reordered as [1,0,0,2]; a result element is kept
        ; where the matching %mask element equals 0 and is zero otherwise.
        ; Both check prefixes expect vptestnmd on zmm0 and a {z} vpshufd that reads
        ; its source from memory.
   8644 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
   8645 ; GENERIC:       # %bb.0:
   8646 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
   8647 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
   8648 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8649 ;
   8650 ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
   8651 ; SKX:       # %bb.0:
   8652 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
   8653 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
   8654 ; SKX-NEXT:    retq # sched: [7:1.00]
   8655   %vec = load <16 x i32>, <16 x i32>* %vp
   8656   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
   8657   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   8658   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   8659   ret <16 x i32> %res
   8660 }
   8661 
   8662 define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
        ; Merge-masked in-group permute of a vector loaded from %vp: each group of
        ; four i32 elements is reordered as [2,0,1,2]; a result element comes from
        ; the shuffle where the matching %mask element equals 0 and from %vec2
        ; otherwise.
        ; Both check prefixes expect vptestnmd and a {%k1}-masked vpshufd that reads
        ; its source from memory and writes zmm0 directly (no trailing move).
   8663 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask2:
   8664 ; GENERIC:       # %bb.0:
   8665 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   8666 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
   8667 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8668 ;
   8669 ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask2:
   8670 ; SKX:       # %bb.0:
   8671 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   8672 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
   8673 ; SKX-NEXT:    retq # sched: [7:1.00]
   8674   %vec = load <16 x i32>, <16 x i32>* %vp
   8675   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
   8676   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   8677   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   8678   ret <16 x i32> %res
   8679 }
   8680 
   8681 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
        ; Zero-masked in-group permute of a vector loaded from %vp: each group of
        ; four i32 elements is reordered as [2,0,1,2]; a result element is kept
        ; where the matching %mask element equals 0 and is zero otherwise.
        ; Both check prefixes expect vptestnmd on zmm0 and a {z} vpshufd that reads
        ; its source from memory.
   8682 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
   8683 ; GENERIC:       # %bb.0:
   8684 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
   8685 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
   8686 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8687 ;
   8688 ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
   8689 ; SKX:       # %bb.0:
   8690 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
   8691 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
   8692 ; SKX-NEXT:    retq # sched: [7:1.00]
   8693   %vec = load <16 x i32>, <16 x i32>* %vp
   8694   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
   8695   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   8696   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   8697   ret <16 x i32> %res
   8698 }
   8699 
   8700 define <16 x i32> @test2_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
        ; Unmasked in-group permute of a vector loaded from %vp: each group of four
        ; i32 elements is reordered as [3,1,1,1].
        ; Both check prefixes expect one vpermilps whose source is the memory
        ; operand, i.e. no separate load instruction.
   8701 ; GENERIC-LABEL: test2_16xi32_perm_mem_mask3:
   8702 ; GENERIC:       # %bb.0:
   8703 ; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
   8704 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8705 ;
   8706 ; SKX-LABEL: test2_16xi32_perm_mem_mask3:
   8707 ; SKX:       # %bb.0:
   8708 ; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
   8709 ; SKX-NEXT:    retq # sched: [7:1.00]
   8710   %vec = load <16 x i32>, <16 x i32>* %vp
   8711   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
   8712   ret <16 x i32> %res
   8713 }
   8714 define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
        ; Merge-masked in-group permute of a vector loaded from %vp: each group of
        ; four i32 elements is reordered as [3,1,1,1]; a result element comes from
        ; the shuffle where the matching %mask element equals 0 and from %vec2
        ; otherwise.
        ; Both check prefixes expect vptestnmd and a {%k1}-masked vpshufd that reads
        ; its source from memory and writes zmm0 directly (no trailing move).
   8715 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask3:
   8716 ; GENERIC:       # %bb.0:
   8717 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   8718 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
   8719 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8720 ;
   8721 ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask3:
   8722 ; SKX:       # %bb.0:
   8723 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   8724 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
   8725 ; SKX-NEXT:    retq # sched: [7:1.00]
   8726   %vec = load <16 x i32>, <16 x i32>* %vp
   8727   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
   8728   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   8729   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   8730   ret <16 x i32> %res
   8731 }
   8732 
   8733 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
        ; Zero-masked in-group permute of a vector loaded from %vp: each group of
        ; four i32 elements is reordered as [3,1,1,1]; a result element is kept
        ; where the matching %mask element equals 0 and is zero otherwise.
        ; Both check prefixes expect vptestnmd on zmm0 and a {z} vpshufd that reads
        ; its source from memory.
   8734 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
   8735 ; GENERIC:       # %bb.0:
   8736 ; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
   8737 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
   8738 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8739 ;
   8740 ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
   8741 ; SKX:       # %bb.0:
   8742 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
   8743 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
   8744 ; SKX-NEXT:    retq # sched: [7:1.00]
   8745   %vec = load <16 x i32>, <16 x i32>* %vp
   8746   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
   8747   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   8748   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   8749   ret <16 x i32> %res
   8750 }
   8751 
   8752 define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) {
        ; Unmasked two-source shuffle: the result is elements 4-7 of %vec1 followed
        ; by elements 0-3 of %vec2 (shuffle indices 8-11 select from the second
        ; operand).
        ; Both check prefixes expect a single vperm2f128.
   8753 ; GENERIC-LABEL: test2_8xfloat_shuff_mask0:
   8754 ; GENERIC:       # %bb.0:
   8755 ; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   8756 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8757 ;
   8758 ; SKX-LABEL: test2_8xfloat_shuff_mask0:
   8759 ; SKX:       # %bb.0:
   8760 ; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   8761 ; SKX-NEXT:    retq # sched: [7:1.00]
   8762   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   8763   ret <8 x float> %res
   8764 }
   8765 define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
        ; Merge-masked two-source shuffle: the shuffle is elements 4-7 of %vec1
        ; followed by elements 0-3 of %vec2; a result element comes from the shuffle
        ; where the matching %mask element equals 0 and from %vec3 otherwise.
        ; Both check prefixes expect vptestnmd, a {%k1}-masked vshuff32x4 into ymm2,
        ; then a vmovaps of ymm2 into ymm0.
   8766 ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0:
   8767 ; GENERIC:       # %bb.0:
   8768 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   8769 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
   8770 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
   8771 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8772 ;
   8773 ; SKX-LABEL: test2_8xfloat_masked_shuff_mask0:
   8774 ; SKX:       # %bb.0:
   8775 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   8776 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
   8777 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
   8778 ; SKX-NEXT:    retq # sched: [7:1.00]
   8779   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   8780   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   8781   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   8782   ret <8 x float> %res
   8783 }
   8784 
   8785 define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
        ; Zero-masked two-source shuffle: the shuffle is elements 4-7 of %vec1
        ; followed by elements 0-3 of %vec2; a result element is kept where the
        ; matching %mask element equals 0 and is zero otherwise.
        ; Both check prefixes expect vptestnmd into %k1 followed by a {z} vshuff32x4.
   8786 ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
   8787 ; GENERIC:       # %bb.0:
   8788 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   8789 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
   8790 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8791 ;
   8792 ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
   8793 ; SKX:       # %bb.0:
   8794 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   8795 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
   8796 ; SKX-NEXT:    retq # sched: [7:1.00]
   8797   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   8798   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   8799   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   8800   ret <8 x float> %res
   8801 }
   8802 define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
        ; Merge-masked two-source shuffle: the shuffle is elements 4-7 of %vec1
        ; followed by elements 0-3 of %vec2; a result element comes from the shuffle
        ; where the matching %mask element equals 0 and from %vec3 otherwise.
        ; Both check prefixes expect vptestnmd, a {%k1}-masked vshuff32x4 into ymm2,
        ; then a vmovaps of ymm2 into ymm0.
        ; NOTE(review): the shuffle indices here are identical to those of
        ; @test2_8xfloat_masked_shuff_mask0, so the two tests cover the same case.
   8803 ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1:
   8804 ; GENERIC:       # %bb.0:
   8805 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   8806 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
   8807 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
   8808 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8809 ;
   8810 ; SKX-LABEL: test2_8xfloat_masked_shuff_mask1:
   8811 ; SKX:       # %bb.0:
   8812 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   8813 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
   8814 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
   8815 ; SKX-NEXT:    retq # sched: [7:1.00]
   8816   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   8817   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   8818   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   8819   ret <8 x float> %res
   8820 }
   8821 
   8822 define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
        ; Zero-masked two-source shuffle: the shuffle is elements 4-7 of %vec1
        ; followed by elements 0-3 of %vec2; a result element is kept where the
        ; matching %mask element equals 0 and is zero otherwise.
        ; Both check prefixes expect vptestnmd into %k1 followed by a {z} vshuff32x4.
        ; NOTE(review): the shuffle indices here are identical to those of
        ; @test2_8xfloat_zero_masked_shuff_mask0.
   8823 ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
   8824 ; GENERIC:       # %bb.0:
   8825 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   8826 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
   8827 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8828 ;
   8829 ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
   8830 ; SKX:       # %bb.0:
   8831 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   8832 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
   8833 ; SKX-NEXT:    retq # sched: [7:1.00]
   8834   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   8835   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   8836   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   8837   ret <8 x float> %res
   8838 }
   8839 define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
        ; Merge-masked two-source shuffle: the shuffle is elements 4-7 of %vec1
        ; followed by elements 4-7 of %vec2 (shuffle indices 12-15); a result
        ; element comes from the shuffle where the matching %mask element equals 0
        ; and from %vec3 otherwise.
        ; Both check prefixes expect vptestnmd, a {%k1}-masked vshuff32x4 into ymm2,
        ; then a vmovaps of ymm2 into ymm0.
   8840 ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2:
   8841 ; GENERIC:       # %bb.0:
   8842 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   8843 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
   8844 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
   8845 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8846 ;
   8847 ; SKX-LABEL: test2_8xfloat_masked_shuff_mask2:
   8848 ; SKX:       # %bb.0:
   8849 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   8850 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
   8851 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
   8852 ; SKX-NEXT:    retq # sched: [7:1.00]
   8853   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   8854   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   8855   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   8856   ret <8 x float> %res
   8857 }
   8858 
   8859 define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
        ; Zero-masked two-source shuffle: the shuffle is elements 4-7 of %vec1
        ; followed by elements 4-7 of %vec2 (shuffle indices 12-15); a result
        ; element is kept where the matching %mask element equals 0 and is zero
        ; otherwise.
        ; Both check prefixes expect vptestnmd into %k1 followed by a {z} vshuff32x4.
   8860 ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
   8861 ; GENERIC:       # %bb.0:
   8862 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   8863 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
   8864 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8865 ;
   8866 ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
   8867 ; SKX:       # %bb.0:
   8868 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   8869 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
   8870 ; SKX-NEXT:    retq # sched: [7:1.00]
   8871   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   8872   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   8873   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   8874   ret <8 x float> %res
   8875 }
   8876 define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
        ; Unmasked two-source shuffle: the result is elements 4-7 of %vec1 followed
        ; by elements 0-3 of %vec2.
        ; Both check prefixes expect a single vperm2f128.
        ; NOTE(review): the shuffle indices here are identical to those of
        ; @test2_8xfloat_shuff_mask0.
   8877 ; GENERIC-LABEL: test2_8xfloat_shuff_mask3:
   8878 ; GENERIC:       # %bb.0:
   8879 ; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   8880 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8881 ;
   8882 ; SKX-LABEL: test2_8xfloat_shuff_mask3:
   8883 ; SKX:       # %bb.0:
   8884 ; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   8885 ; SKX-NEXT:    retq # sched: [7:1.00]
   8886   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   8887   ret <8 x float> %res
   8888 }
   8889 define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
        ; Merge-masked two-source shuffle: the shuffle is elements 4-7 of %vec1
        ; followed by elements 0-3 of %vec2; a result element comes from the shuffle
        ; where the matching %mask element equals 0 and from %vec3 otherwise.
        ; Both check prefixes expect vptestnmd, a {%k1}-masked vshuff32x4 into ymm2,
        ; then a vmovaps of ymm2 into ymm0.
        ; NOTE(review): the shuffle indices here are identical to those of
        ; @test2_8xfloat_masked_shuff_mask0.
   8890 ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3:
   8891 ; GENERIC:       # %bb.0:
   8892 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   8893 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
   8894 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
   8895 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8896 ;
   8897 ; SKX-LABEL: test2_8xfloat_masked_shuff_mask3:
   8898 ; SKX:       # %bb.0:
   8899 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   8900 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
   8901 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
   8902 ; SKX-NEXT:    retq # sched: [7:1.00]
   8903   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   8904   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   8905   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   8906   ret <8 x float> %res
   8907 }
   8908 
   8909 define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
        ; Zero-masked two-source shuffle: the shuffle is elements 4-7 of %vec1
        ; followed by elements 0-3 of %vec2; a result element is kept where the
        ; matching %mask element equals 0 and is zero otherwise.
        ; Both check prefixes expect vptestnmd into %k1 followed by a {z} vshuff32x4.
        ; NOTE(review): this function is named with a test_ prefix while the
        ; preceding 8xfloat shuffle tests use test2_; the name is left as is because
        ; the LABEL checks below depend on it.
   8910 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3:
   8911 ; GENERIC:       # %bb.0:
   8912 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   8913 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
   8914 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8915 ;
   8916 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3:
   8917 ; SKX:       # %bb.0:
   8918 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   8919 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
   8920 ; SKX-NEXT:    retq # sched: [7:1.00]
   8921   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   8922   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   8923   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   8924   ret <8 x float> %res
   8925 }
   8926 define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
        ; Unmasked two-source shuffle with the second source loaded from %vec2p:
        ; the result is elements 4-7 of %vec1 followed by elements 4-7 of the loaded
        ; vector (shuffle indices 12-15).
        ; Both check prefixes expect a single vperm2f128 with a memory operand,
        ; i.e. no separate load instruction.
   8927 ; GENERIC-LABEL: test_8xfloat_shuff_mem_mask0:
   8928 ; GENERIC:       # %bb.0:
   8929 ; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
   8930 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8931 ;
   8932 ; SKX-LABEL: test_8xfloat_shuff_mem_mask0:
   8933 ; SKX:       # %bb.0:
   8934 ; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
   8935 ; SKX-NEXT:    retq # sched: [7:1.00]
   8936   %vec2 = load <8 x float>, <8 x float>* %vec2p
   8937   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   8938   ret <8 x float> %res
   8939 }
   8940 define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
        ; Merge-masked two-source shuffle with the second source loaded from
        ; %vec2p: the shuffle is elements 4-7 of %vec1 followed by elements 4-7 of
        ; the loaded vector; a result element comes from the shuffle where the
        ; matching %mask element equals 0 and from %vec3 otherwise.
        ; Both check prefixes expect vptestnmd, a {%k1}-masked vshuff32x4 with a
        ; memory operand into ymm1, then a vmovaps of ymm1 into ymm0.
   8941 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
   8942 ; GENERIC:       # %bb.0:
   8943 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   8944 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
   8945 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   8946 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8947 ;
   8948 ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0:
   8949 ; SKX:       # %bb.0:
   8950 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   8951 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
   8952 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   8953 ; SKX-NEXT:    retq # sched: [7:1.00]
   8954   %vec2 = load <8 x float>, <8 x float>* %vec2p
   8955   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   8956   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   8957   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   8958   ret <8 x float> %res
   8959 }
   8960 
   8961 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
        ; Zero-masked two-source shuffle with the second source loaded from %vec2p:
        ; the shuffle is elements 4-7 of %vec1 followed by elements 4-7 of the
        ; loaded vector; a result element is kept where the matching %mask element
        ; equals 0 and is zero otherwise.
        ; Both check prefixes expect vptestnmd into %k1 followed by a {z} vshuff32x4
        ; with a memory operand.
   8962 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
   8963 ; GENERIC:       # %bb.0:
   8964 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   8965 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
   8966 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8967 ;
   8968 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
   8969 ; SKX:       # %bb.0:
   8970 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   8971 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
   8972 ; SKX-NEXT:    retq # sched: [7:1.00]
   8973   %vec2 = load <8 x float>, <8 x float>* %vec2p
   8974   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   8975   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   8976   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   8977   ret <8 x float> %res
   8978 }
   8979 
   8980 define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; Merge-masked shuffle of %vec1 with a vector loaded from %vec2p.
   8981 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
   8982 ; GENERIC:       # %bb.0:
   8983 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   8984 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
   8985 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   8986 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   8987 ;
   8988 ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1:
   8989 ; SKX:       # %bb.0:
   8990 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   8991 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
   8992 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   8993 ; SKX-NEXT:    retq # sched: [7:1.00]
   8994   %vec2 = load <8 x float>, <8 x float>* %vec2p ; second shuffle operand is read from memory
   8995   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> ; %vec1[4..7], then %vec2[4..7]. NOTE(review): same indices as test_8xfloat_masked_shuff_mem_mask0 - confirm the duplication is intended
   8996   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   8997   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 ; shuffled lane where %cmp is true, else the %vec3 lane
   8998   ret <8 x float> %res
   8999 }
   9000 
   9001 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; Zero-masked shuffle of %vec1 with a vector loaded from %vec2p.
   9002 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
   9003 ; GENERIC:       # %bb.0:
   9004 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   9005 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
   9006 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9007 ;
   9008 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
   9009 ; SKX:       # %bb.0:
   9010 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   9011 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
   9012 ; SKX-NEXT:    retq # sched: [7:1.00]
   9013   %vec2 = load <8 x float>, <8 x float>* %vec2p ; second shuffle operand is read from memory
   9014   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> ; %vec1[4..7], then %vec2[4..7]. NOTE(review): same indices as test_8xfloat_zero_masked_shuff_mem_mask0 - confirm the duplication is intended
   9015   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9016   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ; shuffled lane where %cmp is true, else zero
   9017   ret <8 x float> %res
   9018 }
   9019 
   9020 define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; Merge-masked shuffle of %vec1 with a vector loaded from %vec2p.
   9021 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
   9022 ; GENERIC:       # %bb.0:
   9023 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   9024 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
   9025 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   9026 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9027 ;
   9028 ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2:
   9029 ; SKX:       # %bb.0:
   9030 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   9031 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
   9032 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   9033 ; SKX-NEXT:    retq # sched: [7:1.00]
   9034   %vec2 = load <8 x float>, <8 x float>* %vec2p ; second shuffle operand is read from memory
   9035   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; %vec1[4..7], then %vec2[0..3]
   9036   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9037   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 ; shuffled lane where %cmp is true, else the %vec3 lane
   9038   ret <8 x float> %res
   9039 }
   9040 
   9041 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; Zero-masked shuffle of %vec1 with a vector loaded from %vec2p.
   9042 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
   9043 ; GENERIC:       # %bb.0:
   9044 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   9045 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
   9046 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9047 ;
   9048 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
   9049 ; SKX:       # %bb.0:
   9050 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   9051 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
   9052 ; SKX-NEXT:    retq # sched: [7:1.00]
   9053   %vec2 = load <8 x float>, <8 x float>* %vec2p ; second shuffle operand is read from memory
   9054   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; %vec1[4..7], then %vec2[0..3]
   9055   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9056   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ; shuffled lane where %cmp is true, else zero
   9057   ret <8 x float> %res
   9058 }
   9059 
   9060 define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) { ; Unmasked shuffle of %vec1 with a vector loaded from %vec2p.
   9061 ; GENERIC-LABEL: test_8xfloat_shuff_mem_mask3:
   9062 ; GENERIC:       # %bb.0:
   9063 ; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
   9064 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9065 ;
   9066 ; SKX-LABEL: test_8xfloat_shuff_mem_mask3:
   9067 ; SKX:       # %bb.0:
   9068 ; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
   9069 ; SKX-NEXT:    retq # sched: [7:1.00]
   9070   %vec2 = load <8 x float>, <8 x float>* %vec2p ; second shuffle operand is read from memory
   9071   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; %vec1[4..7], then %vec2[0..3]
   9072   ret <8 x float> %res
   9073 }
   9074 define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; Merge-masked shuffle of %vec1 with a vector loaded from %vec2p.
   9075 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
   9076 ; GENERIC:       # %bb.0:
   9077 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   9078 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
   9079 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   9080 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9081 ;
   9082 ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3:
   9083 ; SKX:       # %bb.0:
   9084 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   9085 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
   9086 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   9087 ; SKX-NEXT:    retq # sched: [7:1.00]
   9088   %vec2 = load <8 x float>, <8 x float>* %vec2p ; second shuffle operand is read from memory
   9089   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; %vec1[4..7], then %vec2[0..3]. NOTE(review): same indices as test_8xfloat_masked_shuff_mem_mask2 - confirm the duplication is intended
   9090   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9091   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 ; shuffled lane where %cmp is true, else the %vec3 lane
   9092   ret <8 x float> %res
   9093 }
   9094 
   9095 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; Zero-masked shuffle of %vec1 with a vector loaded from %vec2p.
   9096 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
   9097 ; GENERIC:       # %bb.0:
   9098 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   9099 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
   9100 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9101 ;
   9102 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
   9103 ; SKX:       # %bb.0:
   9104 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   9105 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
   9106 ; SKX-NEXT:    retq # sched: [7:1.00]
   9107   %vec2 = load <8 x float>, <8 x float>* %vec2p ; second shuffle operand is read from memory
   9108   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; %vec1[4..7], then %vec2[0..3]. NOTE(review): same indices as test_8xfloat_zero_masked_shuff_mem_mask2 - confirm the duplication is intended
   9109   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9110   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ; shuffled lane where %cmp is true, else zero
   9111   ret <8 x float> %res
   9112 }
   9113 
   9114 define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; Unmasked shuffle of %vec1 and %vec2. NOTE(review): %mask is never referenced in this body - confirm the extra parameter is intended
   9115 ; GENERIC-LABEL: test_16xfloat_shuff_mask0:
   9116 ; GENERIC:       # %bb.0:
   9117 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
   9118 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9119 ;
   9120 ; SKX-LABEL: test_16xfloat_shuff_mask0:
   9121 ; SKX:       # %bb.0:
   9122 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
   9123 ; SKX-NEXT:    retq # sched: [7:1.00]
   9124   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31> ; %vec1[12..15], %vec1[0..3], %vec2[4..7], %vec2[12..15]
   9125   ret <16 x float> %res
   9126 }
   9127 define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; Merge-masked shuffle of %vec1 and %vec2.
   9128 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask0:
   9129 ; GENERIC:       # %bb.0:
   9130 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   9131 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
   9132 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
   9133 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9134 ;
   9135 ; SKX-LABEL: test_16xfloat_masked_shuff_mask0:
   9136 ; SKX:       # %bb.0:
   9137 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   9138 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
   9139 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
   9140 ; SKX-NEXT:    retq # sched: [7:1.00]
   9141   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31> ; %vec1[12..15], %vec1[0..3], %vec2[4..7], %vec2[12..15]
   9142   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9143   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 ; shuffled lane where %cmp is true, else the %vec3 lane
   9144   ret <16 x float> %res
   9145 }
   9146 
   9147 define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; Zero-masked shuffle of %vec1 and %vec2.
   9148 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask0:
   9149 ; GENERIC:       # %bb.0:
   9150 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   9151 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
   9152 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9153 ;
   9154 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask0:
   9155 ; SKX:       # %bb.0:
   9156 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   9157 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
   9158 ; SKX-NEXT:    retq # sched: [7:1.00]
   9159   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31> ; %vec1[12..15], %vec1[0..3], %vec2[4..7], %vec2[12..15]
   9160   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9161   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ; shuffled lane where %cmp is true, else zero
   9162   ret <16 x float> %res
   9163 }
   9164 define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; Merge-masked shuffle of %vec1 and %vec2.
   9165 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask1:
   9166 ; GENERIC:       # %bb.0:
   9167 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   9168 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
   9169 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
   9170 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9171 ;
   9172 ; SKX-LABEL: test_16xfloat_masked_shuff_mask1:
   9173 ; SKX:       # %bb.0:
   9174 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   9175 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
   9176 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
   9177 ; SKX-NEXT:    retq # sched: [7:1.00]
   9178   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31> ; %vec1[0..3], %vec1[8..11], %vec2[0..3], %vec2[12..15]
   9179   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9180   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 ; shuffled lane where %cmp is true, else the %vec3 lane
   9181   ret <16 x float> %res
   9182 }
   9183 
   9184 define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; Zero-masked shuffle of %vec1 and %vec2.
   9185 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask1:
   9186 ; GENERIC:       # %bb.0:
   9187 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   9188 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
   9189 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9190 ;
   9191 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask1:
   9192 ; SKX:       # %bb.0:
   9193 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   9194 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
   9195 ; SKX-NEXT:    retq # sched: [7:1.00]
   9196   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31> ; %vec1[0..3], %vec1[8..11], %vec2[0..3], %vec2[12..15]
   9197   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9198   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ; shuffled lane where %cmp is true, else zero
   9199   ret <16 x float> %res
   9200 }
   9201 define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; Merge-masked shuffle of %vec1 and %vec2.
   9202 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask2:
   9203 ; GENERIC:       # %bb.0:
   9204 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   9205 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
   9206 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
   9207 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9208 ;
   9209 ; SKX-LABEL: test_16xfloat_masked_shuff_mask2:
   9210 ; SKX:       # %bb.0:
   9211 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   9212 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
   9213 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
   9214 ; SKX-NEXT:    retq # sched: [7:1.00]
   9215   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> ; %vec1[12..15], %vec1[4..7], %vec2[0..3], %vec2[4..7]
   9216   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9217   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 ; shuffled lane where %cmp is true, else the %vec3 lane
   9218   ret <16 x float> %res
   9219 }
   9220 
   9221 define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; Zero-masked shuffle of %vec1 and %vec2.
   9222 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask2:
   9223 ; GENERIC:       # %bb.0:
   9224 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   9225 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
   9226 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9227 ;
   9228 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask2:
   9229 ; SKX:       # %bb.0:
   9230 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   9231 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
   9232 ; SKX-NEXT:    retq # sched: [7:1.00]
   9233   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> ; %vec1[12..15], %vec1[4..7], %vec2[0..3], %vec2[4..7]
   9234   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9235   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ; shuffled lane where %cmp is true, else zero
   9236   ret <16 x float> %res
   9237 }
   9238 define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) { ; Unmasked shuffle of %vec1 and %vec2.
   9239 ; GENERIC-LABEL: test_16xfloat_shuff_mask3:
   9240 ; GENERIC:       # %bb.0:
   9241 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
   9242 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9243 ;
   9244 ; SKX-LABEL: test_16xfloat_shuff_mask3:
   9245 ; SKX:       # %bb.0:
   9246 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
   9247 ; SKX-NEXT:    retq # sched: [7:1.00]
   9248   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27> ; %vec1[8..11], %vec1[12..15], %vec2[0..3], %vec2[8..11]
   9249   ret <16 x float> %res
   9250 }
   9251 define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { ; Merge-masked shuffle of %vec1 and %vec2.
   9252 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask3:
   9253 ; GENERIC:       # %bb.0:
   9254 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   9255 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
   9256 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
   9257 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9258 ;
   9259 ; SKX-LABEL: test_16xfloat_masked_shuff_mask3:
   9260 ; SKX:       # %bb.0:
   9261 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   9262 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
   9263 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
   9264 ; SKX-NEXT:    retq # sched: [7:1.00]
   9265   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27> ; %vec1[8..11], %vec1[12..15], %vec2[0..3], %vec2[8..11]
   9266   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9267   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 ; shuffled lane where %cmp is true, else the %vec3 lane
   9268   ret <16 x float> %res
   9269 }
   9270 
   9271 define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { ; Zero-masked shuffle of %vec1 and %vec2.
   9272 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask3:
   9273 ; GENERIC:       # %bb.0:
   9274 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   9275 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
   9276 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9277 ;
   9278 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask3:
   9279 ; SKX:       # %bb.0:
   9280 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   9281 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
   9282 ; SKX-NEXT:    retq # sched: [7:1.00]
   9283   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27> ; %vec1[8..11], %vec1[12..15], %vec2[0..3], %vec2[8..11]
   9284   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9285   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ; shuffled lane where %cmp is true, else zero
   9286   ret <16 x float> %res
   9287 }
   9288 define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) { ; Unmasked shuffle of %vec1 with a vector loaded from %vec2p.
   9289 ; GENERIC-LABEL: test_16xfloat_shuff_mem_mask0:
   9290 ; GENERIC:       # %bb.0:
   9291 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
   9292 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9293 ;
   9294 ; SKX-LABEL: test_16xfloat_shuff_mem_mask0:
   9295 ; SKX:       # %bb.0:
   9296 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
   9297 ; SKX-NEXT:    retq # sched: [7:1.00]
   9298   %vec2 = load <16 x float>, <16 x float>* %vec2p ; second shuffle operand is read from memory
   9299   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; %vec1[12..15], %vec1[8..11], %vec2[8..11], %vec2[4..7]
   9300   ret <16 x float> %res
   9301 }
   9302 define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; Merge-masked shuffle of %vec1 with a vector loaded from %vec2p.
   9303 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask0:
   9304 ; GENERIC:       # %bb.0:
   9305 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   9306 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
   9307 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   9308 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9309 ;
   9310 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask0:
   9311 ; SKX:       # %bb.0:
   9312 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   9313 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
   9314 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   9315 ; SKX-NEXT:    retq # sched: [7:1.00]
   9316   %vec2 = load <16 x float>, <16 x float>* %vec2p ; second shuffle operand is read from memory
   9317   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; %vec1[12..15], %vec1[8..11], %vec2[8..11], %vec2[4..7]
   9318   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9319   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 ; shuffled lane where %cmp is true, else the %vec3 lane
   9320   ret <16 x float> %res
   9321 }
   9322 
   9323 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; Zero-masked shuffle of %vec1 with a vector loaded from %vec2p.
   9324 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
   9325 ; GENERIC:       # %bb.0:
   9326 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   9327 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
   9328 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9329 ;
   9330 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
   9331 ; SKX:       # %bb.0:
   9332 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   9333 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
   9334 ; SKX-NEXT:    retq # sched: [7:1.00]
   9335   %vec2 = load <16 x float>, <16 x float>* %vec2p ; second shuffle operand is read from memory
   9336   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; %vec1[12..15], %vec1[8..11], %vec2[8..11], %vec2[4..7]
   9337   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9338   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ; shuffled lane where %cmp is true, else zero
   9339   ret <16 x float> %res
   9340 }
   9341 
   9342 define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; Merge-masked shuffle of %vec1 with a vector loaded from %vec2p.
   9343 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask1:
   9344 ; GENERIC:       # %bb.0:
   9345 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   9346 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
   9347 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   9348 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9349 ;
   9350 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask1:
   9351 ; SKX:       # %bb.0:
   9352 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   9353 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
   9354 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   9355 ; SKX-NEXT:    retq # sched: [7:1.00]
   9356   %vec2 = load <16 x float>, <16 x float>* %vec2p ; second shuffle operand is read from memory
   9357   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; %vec1[8..11], %vec1[4..7], %vec2[8..11], %vec2[4..7]
   9358   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9359   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 ; shuffled lane where %cmp is true, else the %vec3 lane
   9360   ret <16 x float> %res
   9361 }
   9362 
   9363 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; Zero-masked shuffle of %vec1 with a vector loaded from %vec2p.
   9364 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
   9365 ; GENERIC:       # %bb.0:
   9366 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   9367 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
   9368 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9369 ;
   9370 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
   9371 ; SKX:       # %bb.0:
   9372 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   9373 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
   9374 ; SKX-NEXT:    retq # sched: [7:1.00]
   9375   %vec2 = load <16 x float>, <16 x float>* %vec2p ; second shuffle operand is read from memory
   9376   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; %vec1[8..11], %vec1[4..7], %vec2[8..11], %vec2[4..7]
   9377   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9378   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ; shuffled lane where %cmp is true, else zero
   9379   ret <16 x float> %res
   9380 }
   9381 
   9382 define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; Merge-masked shuffle of %vec1 with a vector loaded from %vec2p.
   9383 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask2:
   9384 ; GENERIC:       # %bb.0:
   9385 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   9386 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [8:1.00]
   9387 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   9388 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9389 ;
   9390 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask2:
   9391 ; SKX:       # %bb.0:
   9392 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   9393 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
   9394 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   9395 ; SKX-NEXT:    retq # sched: [7:1.00]
   9396   %vec2 = load <16 x float>, <16 x float>* %vec2p ; second shuffle operand is read from memory
   9397   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27> ; %vec1[0..3] twice, then %vec2[8..11] twice
   9398   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9399   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 ; shuffled lane where %cmp is true, else the %vec3 lane
   9400   ret <16 x float> %res
   9401 }
   9402 
   9403 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { ; Zero-masked shuffle of %vec1 with a vector loaded from %vec2p.
   9404 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
   9405 ; GENERIC:       # %bb.0:
   9406 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   9407 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [8:1.00]
   9408 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9409 ;
   9410 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
   9411 ; SKX:       # %bb.0:
   9412 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   9413 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
   9414 ; SKX-NEXT:    retq # sched: [7:1.00]
   9415   %vec2 = load <16 x float>, <16 x float>* %vec2p ; second shuffle operand is read from memory
   9416   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27> ; %vec1[0..3] twice, then %vec2[8..11] twice
   9417   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9418   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ; shuffled lane where %cmp is true, else zero
   9419   ret <16 x float> %res
   9420 }
   9421 
   9422 define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) { ; Unmasked shuffle of %vec1 with a vector loaded from %vec2p.
   9423 ; GENERIC-LABEL: test_16xfloat_shuff_mem_mask3:
   9424 ; GENERIC:       # %bb.0:
   9425 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
   9426 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9427 ;
   9428 ; SKX-LABEL: test_16xfloat_shuff_mem_mask3:
   9429 ; SKX:       # %bb.0:
   9430 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
   9431 ; SKX-NEXT:    retq # sched: [7:1.00]
   9432   %vec2 = load <16 x float>, <16 x float>* %vec2p ; second shuffle operand is read from memory
   9433   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31> ; %vec1[4..7], %vec1[0..3], then %vec2[12..15] twice
   9434   ret <16 x float> %res
   9435 }
   9436 define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { ; Merge-masked shuffle of %vec1 with a vector loaded from %vec2p.
   9437 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask3:
   9438 ; GENERIC:       # %bb.0:
   9439 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   9440 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
   9441 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   9442 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9443 ;
   9444 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask3:
   9445 ; SKX:       # %bb.0:
   9446 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   9447 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
   9448 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   9449 ; SKX-NEXT:    retq # sched: [7:1.00]
   9450   %vec2 = load <16 x float>, <16 x float>* %vec2p ; second shuffle operand is read from memory
   9451   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31> ; %vec1[4..7], %vec1[0..3], then %vec2[12..15] twice
   9452   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; true in lanes where the %mask element equals zero
   9453   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 ; shuffled lane where %cmp is true, else the %vec3 lane
   9454   ret <16 x float> %res
   9455 }
   9456 
   9457 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
        ; Shuffle of %vec1 with the vector loaded from %vec2p, zeroed under %mask:
        ; lanes whose %mask element equals zero take the shuffle result, all other
        ; lanes are zeroinitializer. The checks expect vptestnmd into %k1 and a
        ; {%k1} {z} vshuff32x4 with a memory operand.
   9458 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
   9459 ; GENERIC:       # %bb.0:
   9460 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   9461 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
   9462 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9463 ;
   9464 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
   9465 ; SKX:       # %bb.0:
   9466 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   9467 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
   9468 ; SKX-NEXT:    retq # sched: [7:1.00]
   9469   %vec2 = load <16 x float>, <16 x float>* %vec2p
   9470   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   9471   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   9472   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   9473   ret <16 x float> %res
   9474 }
   9475 
   9476 define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) {
        ; Unmasked shuffle taking elements 2-3 of %vec1 followed by elements 0-1 of
        ; %vec2 (indices 4-5). Both check prefixes expect vperm2f128 for this
        ; unmasked form.
   9477 ; GENERIC-LABEL: test_4xdouble_shuff_mask0:
   9478 ; GENERIC:       # %bb.0:
   9479 ; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   9480 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9481 ;
   9482 ; SKX-LABEL: test_4xdouble_shuff_mask0:
   9483 ; SKX:       # %bb.0:
   9484 ; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   9485 ; SKX-NEXT:    retq # sched: [7:1.00]
   9486   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   9487   ret <4 x double> %res
   9488 }
   9489 define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
        ; Shuffle <2,3,4,5> of %vec1/%vec2 blended under %mask: lanes whose %mask
        ; element equals zero take the shuffle result, all other lanes take %vec3.
        ; The checks expect vptestnmq into %k1, a {%k1}-masked vshuff64x2, then a
        ; vmovapd copy into %ymm0.
   9490 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0:
   9491 ; GENERIC:       # %bb.0:
   9492 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   9493 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   9494 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
   9495 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9496 ;
   9497 ; SKX-LABEL: test_4xdouble_masked_shuff_mask0:
   9498 ; SKX:       # %bb.0:
   9499 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   9500 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   9501 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
   9502 ; SKX-NEXT:    retq # sched: [7:1.00]
   9503   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   9504   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9505   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   9506   ret <4 x double> %res
   9507 }
   9508 
   9509 define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
        ; Shuffle <2,3,4,5> of %vec1/%vec2 zeroed under %mask: lanes whose %mask
        ; element equals zero take the shuffle result, all other lanes are
        ; zeroinitializer. The checks expect vptestnmq into %k1 and a {%k1} {z}
        ; vshuff64x2.
   9510 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0:
   9511 ; GENERIC:       # %bb.0:
   9512 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   9513 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   9514 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9515 ;
   9516 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0:
   9517 ; SKX:       # %bb.0:
   9518 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   9519 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   9520 ; SKX-NEXT:    retq # sched: [7:1.00]
   9521   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   9522   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9523   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   9524   ret <4 x double> %res
   9525 }
   9526 define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
        ; Shuffle <2,3,4,5> of %vec1/%vec2 blended under %mask: lanes whose %mask
        ; element equals zero take the shuffle result, all other lanes take %vec3.
        ; NOTE(review): the shuffle indices and expected output are identical to
        ; test_4xdouble_masked_shuff_mask0 - confirm whether a distinct pattern was
        ; intended for "mask1".
   9527 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
   9528 ; GENERIC:       # %bb.0:
   9529 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   9530 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   9531 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
   9532 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9533 ;
   9534 ; SKX-LABEL: test_4xdouble_masked_shuff_mask1:
   9535 ; SKX:       # %bb.0:
   9536 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   9537 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   9538 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
   9539 ; SKX-NEXT:    retq # sched: [7:1.00]
   9540   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   9541   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9542   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   9543   ret <4 x double> %res
   9544 }
   9545 
   9546 define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
        ; Shuffle <2,3,4,5> of %vec1/%vec2 zeroed under %mask: lanes whose %mask
        ; element equals zero take the shuffle result, all other lanes are
        ; zeroinitializer.
        ; NOTE(review): the shuffle indices and expected output are identical to
        ; test_4xdouble_zero_masked_shuff_mask0 - confirm whether a distinct pattern
        ; was intended for "mask1".
   9547 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
   9548 ; GENERIC:       # %bb.0:
   9549 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   9550 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   9551 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9552 ;
   9553 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1:
   9554 ; SKX:       # %bb.0:
   9555 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   9556 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   9557 ; SKX-NEXT:    retq # sched: [7:1.00]
   9558   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   9559   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9560   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   9561   ret <4 x double> %res
   9562 }
   9563 define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
        ; Shuffle <2,3,6,7> (elements 2-3 of %vec1, then elements 2-3 of %vec2)
        ; blended under %mask: lanes whose %mask element equals zero take the
        ; shuffle result, all other lanes take %vec3. The checks expect vptestnmq
        ; into %k1, a {%k1}-masked vshuff64x2, then a vmovapd copy into %ymm0.
   9564 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
   9565 ; GENERIC:       # %bb.0:
   9566 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   9567 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
   9568 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
   9569 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9570 ;
   9571 ; SKX-LABEL: test_4xdouble_masked_shuff_mask2:
   9572 ; SKX:       # %bb.0:
   9573 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   9574 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
   9575 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
   9576 ; SKX-NEXT:    retq # sched: [7:1.00]
   9577   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   9578   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9579   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   9580   ret <4 x double> %res
   9581 }
   9582 
   9583 define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
        ; Shuffle <2,3,6,7> (elements 2-3 of %vec1, then elements 2-3 of %vec2)
        ; zeroed under %mask: lanes whose %mask element equals zero take the shuffle
        ; result, all other lanes are zeroinitializer. The checks expect vptestnmq
        ; into %k1 and a {%k1} {z} vshuff64x2.
   9584 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
   9585 ; GENERIC:       # %bb.0:
   9586 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   9587 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
   9588 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9589 ;
   9590 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2:
   9591 ; SKX:       # %bb.0:
   9592 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   9593 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
   9594 ; SKX-NEXT:    retq # sched: [7:1.00]
   9595   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   9596   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9597   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   9598   ret <4 x double> %res
   9599 }
   9600 define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
        ; Unmasked shuffle taking elements 2-3 of %vec1 followed by elements 2-3 of
        ; %vec2 (indices 6-7). Both check prefixes expect vperm2f128 for this
        ; unmasked form.
   9601 ; GENERIC-LABEL: test_4xdouble_shuff_mask3:
   9602 ; GENERIC:       # %bb.0:
   9603 ; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
   9604 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9605 ;
   9606 ; SKX-LABEL: test_4xdouble_shuff_mask3:
   9607 ; SKX:       # %bb.0:
   9608 ; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
   9609 ; SKX-NEXT:    retq # sched: [7:1.00]
   9610   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   9611   ret <4 x double> %res
   9612 }
   9613 define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
        ; Shuffle <2,3,6,7> of %vec1/%vec2 blended under %mask: lanes whose %mask
        ; element equals zero take the shuffle result, all other lanes take %vec3.
        ; NOTE(review): the shuffle indices and expected output are identical to
        ; test_4xdouble_masked_shuff_mask2 - confirm whether a distinct pattern was
        ; intended for "mask3".
   9614 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
   9615 ; GENERIC:       # %bb.0:
   9616 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   9617 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
   9618 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
   9619 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9620 ;
   9621 ; SKX-LABEL: test_4xdouble_masked_shuff_mask3:
   9622 ; SKX:       # %bb.0:
   9623 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   9624 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
   9625 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
   9626 ; SKX-NEXT:    retq # sched: [7:1.00]
   9627   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   9628   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9629   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   9630   ret <4 x double> %res
   9631 }
   9632 
   9633 define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
        ; Shuffle <2,3,6,7> of %vec1/%vec2 zeroed under %mask: lanes whose %mask
        ; element equals zero take the shuffle result, all other lanes are
        ; zeroinitializer.
        ; NOTE(review): the shuffle indices and expected output are identical to
        ; test_4xdouble_zero_masked_shuff_mask2 - confirm whether a distinct pattern
        ; was intended for "mask3".
   9634 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
   9635 ; GENERIC:       # %bb.0:
   9636 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   9637 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
   9638 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9639 ;
   9640 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3:
   9641 ; SKX:       # %bb.0:
   9642 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   9643 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
   9644 ; SKX-NEXT:    retq # sched: [7:1.00]
   9645   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   9646   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9647   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   9648   ret <4 x double> %res
   9649 }
   9650 define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
        ; Unmasked shuffle of %vec1 with the <4 x double> loaded from %vec2p:
        ; elements 2-3 of %vec1, then elements 2-3 of the loaded vector (indices
        ; 6-7). Both check prefixes expect vperm2f128 with a memory operand.
   9651 ; GENERIC-LABEL: test_4xdouble_shuff_mem_mask0:
   9652 ; GENERIC:       # %bb.0:
   9653 ; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
   9654 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9655 ;
   9656 ; SKX-LABEL: test_4xdouble_shuff_mem_mask0:
   9657 ; SKX:       # %bb.0:
   9658 ; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
   9659 ; SKX-NEXT:    retq # sched: [7:1.00]
   9660   %vec2 = load <4 x double>, <4 x double>* %vec2p
   9661   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   9662   ret <4 x double> %res
   9663 }
   9664 define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
        ; Shuffle <2,3,6,7> of %vec1 with the vector loaded from %vec2p, blended
        ; under %mask: lanes whose %mask element equals zero take the shuffle
        ; result, all other lanes take %vec3. The checks expect vptestnmq into %k1,
        ; a {%k1}-masked vshuff64x2 with a memory operand, then a vmovapd copy into
        ; %ymm0.
   9665 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
   9666 ; GENERIC:       # %bb.0:
   9667 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   9668 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
   9669 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   9670 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9671 ;
   9672 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0:
   9673 ; SKX:       # %bb.0:
   9674 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   9675 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
   9676 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   9677 ; SKX-NEXT:    retq # sched: [7:1.00]
   9678   %vec2 = load <4 x double>, <4 x double>* %vec2p
   9679   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   9680   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9681   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   9682   ret <4 x double> %res
   9683 }
   9684 
   9685 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
        ; Shuffle <2,3,6,7> of %vec1 with the vector loaded from %vec2p, zeroed
        ; under %mask: lanes whose %mask element equals zero take the shuffle
        ; result, all other lanes are zeroinitializer. The checks expect vptestnmq
        ; into %k1 and a {%k1} {z} vshuff64x2 with a memory operand.
   9686 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
   9687 ; GENERIC:       # %bb.0:
   9688 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   9689 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
   9690 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9691 ;
   9692 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
   9693 ; SKX:       # %bb.0:
   9694 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   9695 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
   9696 ; SKX-NEXT:    retq # sched: [7:1.00]
   9697   %vec2 = load <4 x double>, <4 x double>* %vec2p
   9698   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   9699   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9700   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   9701   ret <4 x double> %res
   9702 }
   9703 
   9704 define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
        ; Shuffle <2,3,4,5> (elements 2-3 of %vec1, then elements 0-1 of the vector
        ; loaded from %vec2p) blended under %mask: lanes whose %mask element equals
        ; zero take the shuffle result, all other lanes take %vec3. The checks
        ; expect vptestnmq into %k1, a {%k1}-masked vshuff64x2 with a memory
        ; operand, then a vmovapd copy into %ymm0.
   9705 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
   9706 ; GENERIC:       # %bb.0:
   9707 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   9708 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
   9709 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   9710 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9711 ;
   9712 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1:
   9713 ; SKX:       # %bb.0:
   9714 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   9715 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
   9716 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   9717 ; SKX-NEXT:    retq # sched: [7:1.00]
   9718   %vec2 = load <4 x double>, <4 x double>* %vec2p
   9719   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   9720   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9721   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   9722   ret <4 x double> %res
   9723 }
   9724 
   9725 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
        ; Shuffle <2,3,4,5> (elements 2-3 of %vec1, then elements 0-1 of the vector
        ; loaded from %vec2p) zeroed under %mask: lanes whose %mask element equals
        ; zero take the shuffle result, all other lanes are zeroinitializer. The
        ; checks expect vptestnmq into %k1 and a {%k1} {z} vshuff64x2 with a memory
        ; operand.
   9726 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
   9727 ; GENERIC:       # %bb.0:
   9728 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   9729 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
   9730 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9731 ;
   9732 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
   9733 ; SKX:       # %bb.0:
   9734 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   9735 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
   9736 ; SKX-NEXT:    retq # sched: [7:1.00]
   9737   %vec2 = load <4 x double>, <4 x double>* %vec2p
   9738   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   9739   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9740   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   9741   ret <4 x double> %res
   9742 }
   9743 
   9744 define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
        ; Shuffle <2,3,4,5> of %vec1 with the vector loaded from %vec2p, blended
        ; under %mask: lanes whose %mask element equals zero take the shuffle
        ; result, all other lanes take %vec3.
        ; NOTE(review): the shuffle indices and expected output are identical to
        ; test_4xdouble_masked_shuff_mem_mask1 - confirm whether a distinct pattern
        ; was intended for "mask2".
   9745 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
   9746 ; GENERIC:       # %bb.0:
   9747 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   9748 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
   9749 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   9750 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9751 ;
   9752 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2:
   9753 ; SKX:       # %bb.0:
   9754 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   9755 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
   9756 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   9757 ; SKX-NEXT:    retq # sched: [7:1.00]
   9758   %vec2 = load <4 x double>, <4 x double>* %vec2p
   9759   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   9760   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9761   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   9762   ret <4 x double> %res
   9763 }
   9764 
   9765 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
        ; Shuffle <2,3,4,5> of %vec1 with the vector loaded from %vec2p, zeroed
        ; under %mask: lanes whose %mask element equals zero take the shuffle
        ; result, all other lanes are zeroinitializer.
        ; NOTE(review): the shuffle indices and expected output are identical to
        ; test_4xdouble_zero_masked_shuff_mem_mask1 - confirm whether a distinct
        ; pattern was intended for "mask2".
   9766 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
   9767 ; GENERIC:       # %bb.0:
   9768 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   9769 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
   9770 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9771 ;
   9772 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
   9773 ; SKX:       # %bb.0:
   9774 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   9775 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
   9776 ; SKX-NEXT:    retq # sched: [7:1.00]
   9777   %vec2 = load <4 x double>, <4 x double>* %vec2p
   9778   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   9779   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9780   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   9781   ret <4 x double> %res
   9782 }
   9783 
   9784 define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
        ; Unmasked shuffle <2,3,6,7> of %vec1 with the <4 x double> loaded from
        ; %vec2p. Both check prefixes expect vperm2f128 with a memory operand.
        ; NOTE(review): the shuffle indices and expected output are identical to
        ; test_4xdouble_shuff_mem_mask0 - confirm whether a distinct pattern was
        ; intended for "mask3".
   9785 ; GENERIC-LABEL: test_4xdouble_shuff_mem_mask3:
   9786 ; GENERIC:       # %bb.0:
   9787 ; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
   9788 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9789 ;
   9790 ; SKX-LABEL: test_4xdouble_shuff_mem_mask3:
   9791 ; SKX:       # %bb.0:
   9792 ; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
   9793 ; SKX-NEXT:    retq # sched: [7:1.00]
   9794   %vec2 = load <4 x double>, <4 x double>* %vec2p
   9795   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   9796   ret <4 x double> %res
   9797 }
   9798 define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
        ; Shuffle <2,3,6,7> of %vec1 with the vector loaded from %vec2p, blended
        ; under %mask: lanes whose %mask element equals zero take the shuffle
        ; result, all other lanes take %vec3.
        ; NOTE(review): the shuffle indices and expected output are identical to
        ; test_4xdouble_masked_shuff_mem_mask0 - confirm whether a distinct pattern
        ; was intended for "mask3".
   9799 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
   9800 ; GENERIC:       # %bb.0:
   9801 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   9802 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
   9803 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   9804 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9805 ;
   9806 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3:
   9807 ; SKX:       # %bb.0:
   9808 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   9809 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
   9810 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   9811 ; SKX-NEXT:    retq # sched: [7:1.00]
   9812   %vec2 = load <4 x double>, <4 x double>* %vec2p
   9813   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   9814   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9815   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   9816   ret <4 x double> %res
   9817 }
   9818 
   9819 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
        ; Shuffle <2,3,6,7> of %vec1 with the vector loaded from %vec2p, zeroed
        ; under %mask: lanes whose %mask element equals zero take the shuffle
        ; result, all other lanes are zeroinitializer.
        ; NOTE(review): the shuffle indices and expected output are identical to
        ; test_4xdouble_zero_masked_shuff_mem_mask0 - confirm whether a distinct
        ; pattern was intended for "mask3".
   9820 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
   9821 ; GENERIC:       # %bb.0:
   9822 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   9823 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
   9824 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9825 ;
   9826 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
   9827 ; SKX:       # %bb.0:
   9828 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   9829 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
   9830 ; SKX-NEXT:    retq # sched: [7:1.00]
   9831   %vec2 = load <4 x double>, <4 x double>* %vec2p
   9832   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   9833   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   9834   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   9835   ret <4 x double> %res
   9836 }
   9837 
   9838 define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) {
        ; Unmasked shuffle taking elements 6-7 and 2-3 of %vec1, then elements 6-7
        ; and 0-1 of %vec2 (indices 14-15 and 8-9). Both check prefixes expect a
        ; single vshuff64x2 on zmm registers.
   9839 ; GENERIC-LABEL: test_8xdouble_shuff_mask0:
   9840 ; GENERIC:       # %bb.0:
   9841 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
   9842 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9843 ;
   9844 ; SKX-LABEL: test_8xdouble_shuff_mask0:
   9845 ; SKX:       # %bb.0:
   9846 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
   9847 ; SKX-NEXT:    retq # sched: [7:1.00]
   9848   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
   9849   ret <8 x double> %res
   9850 }
   9851 define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
        ; Shuffle <6,7,2,3,14,15,8,9> of %vec1/%vec2 blended under %mask: lanes
        ; whose %mask element equals zero take the shuffle result, all other lanes
        ; take %vec3. The checks expect vptestnmq into %k1, a {%k1}-masked
        ; vshuff64x2, then a vmovapd copy into %zmm0.
   9852 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask0:
   9853 ; GENERIC:       # %bb.0:
   9854 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   9855 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
   9856 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
   9857 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9858 ;
   9859 ; SKX-LABEL: test_8xdouble_masked_shuff_mask0:
   9860 ; SKX:       # %bb.0:
   9861 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   9862 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
   9863 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
   9864 ; SKX-NEXT:    retq # sched: [7:1.00]
   9865   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
   9866   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   9867   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   9868   ret <8 x double> %res
   9869 }
   9870 
   9871 define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
        ; Shuffle <6,7,2,3,14,15,8,9> of %vec1/%vec2 zeroed under %mask: lanes whose
        ; %mask element equals zero take the shuffle result, all other lanes are
        ; zeroinitializer. The checks expect vptestnmq into %k1 and a {%k1} {z}
        ; vshuff64x2.
   9872 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask0:
   9873 ; GENERIC:       # %bb.0:
   9874 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   9875 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
   9876 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9877 ;
   9878 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask0:
   9879 ; SKX:       # %bb.0:
   9880 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   9881 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
   9882 ; SKX-NEXT:    retq # sched: [7:1.00]
   9883   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
   9884   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   9885   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   9886   ret <8 x double> %res
   9887 }
   9888 define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
        ; Shuffle <0,1,4,5,8,9,12,13> (elements 0-1 and 4-5 of each source) blended
        ; under %mask: lanes whose %mask element equals zero take the shuffle
        ; result, all other lanes take %vec3. The checks expect vptestnmq into %k1,
        ; a {%k1}-masked vshuff64x2, then a vmovapd copy into %zmm0.
   9889 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask1:
   9890 ; GENERIC:       # %bb.0:
   9891 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   9892 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
   9893 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
   9894 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9895 ;
   9896 ; SKX-LABEL: test_8xdouble_masked_shuff_mask1:
   9897 ; SKX:       # %bb.0:
   9898 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   9899 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
   9900 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
   9901 ; SKX-NEXT:    retq # sched: [7:1.00]
   9902   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
   9903   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   9904   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   9905   ret <8 x double> %res
   9906 }
   9907 
   9908 define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
        ; Shuffle <0,1,4,5,8,9,12,13> (elements 0-1 and 4-5 of each source) zeroed
        ; under %mask: lanes whose %mask element equals zero take the shuffle
        ; result, all other lanes are zeroinitializer. The checks expect vptestnmq
        ; into %k1 and a {%k1} {z} vshuff64x2.
   9909 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask1:
   9910 ; GENERIC:       # %bb.0:
   9911 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   9912 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
   9913 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9914 ;
   9915 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask1:
   9916 ; SKX:       # %bb.0:
   9917 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   9918 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
   9919 ; SKX-NEXT:    retq # sched: [7:1.00]
   9920   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
   9921   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   9922   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   9923   ret <8 x double> %res
   9924 }
   9925 define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
        ; Shuffle <6,7,4,5,12,13,8,9> (elements 6-7 and 4-5 of %vec1, then elements
        ; 4-5 and 0-1 of %vec2) blended under %mask: lanes whose %mask element
        ; equals zero take the shuffle result, all other lanes take %vec3. The
        ; checks expect vptestnmq into %k1, a {%k1}-masked vshuff64x2, then a
        ; vmovapd copy into %zmm0.
   9926 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask2:
   9927 ; GENERIC:       # %bb.0:
   9928 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   9929 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
   9930 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
   9931 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9932 ;
   9933 ; SKX-LABEL: test_8xdouble_masked_shuff_mask2:
   9934 ; SKX:       # %bb.0:
   9935 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   9936 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
   9937 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
   9938 ; SKX-NEXT:    retq # sched: [7:1.00]
   9939   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
   9940   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   9941   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   9942   ret <8 x double> %res
   9943 }
   9944 
   9945 define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
        ; Shuffle <6,7,4,5,12,13,8,9> (elements 6-7 and 4-5 of %vec1, then elements
        ; 4-5 and 0-1 of %vec2) zeroed under %mask: lanes whose %mask element equals
        ; zero take the shuffle result, all other lanes are zeroinitializer. The
        ; checks expect vptestnmq into %k1 and a {%k1} {z} vshuff64x2.
   9946 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask2:
   9947 ; GENERIC:       # %bb.0:
   9948 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   9949 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
   9950 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9951 ;
   9952 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask2:
   9953 ; SKX:       # %bb.0:
   9954 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   9955 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
   9956 ; SKX-NEXT:    retq # sched: [7:1.00]
   9957   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
   9958   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   9959   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   9960   ret <8 x double> %res
   9961 }
   9962 define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {
        ; Unmasked shuffle taking elements 4-5 of %vec1 twice, then elements 4-5 and
        ; 2-3 of %vec2 (indices 12-13 and 10-11). Both check prefixes expect a
        ; single vshuff64x2 on zmm registers.
   9963 ; GENERIC-LABEL: test_8xdouble_shuff_mask3:
   9964 ; GENERIC:       # %bb.0:
   9965 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
   9966 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9967 ;
   9968 ; SKX-LABEL: test_8xdouble_shuff_mask3:
   9969 ; SKX:       # %bb.0:
   9970 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
   9971 ; SKX-NEXT:    retq # sched: [7:1.00]
   9972   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
   9973   ret <8 x double> %res
   9974 }
   9975 define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
        ; Shuffle <4,5,4,5,12,13,10,11> (elements 4-5 of %vec1 twice, then elements
        ; 4-5 and 2-3 of %vec2) blended under %mask: lanes whose %mask element
        ; equals zero take the shuffle result, all other lanes take %vec3. The
        ; checks expect vptestnmq into %k1, a {%k1}-masked vshuff64x2, then a
        ; vmovapd copy into %zmm0.
   9976 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask3:
   9977 ; GENERIC:       # %bb.0:
   9978 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   9979 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
   9980 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
   9981 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   9982 ;
   9983 ; SKX-LABEL: test_8xdouble_masked_shuff_mask3:
   9984 ; SKX:       # %bb.0:
   9985 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   9986 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
   9987 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
   9988 ; SKX-NEXT:    retq # sched: [7:1.00]
   9989   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
   9990   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   9991   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   9992   ret <8 x double> %res
   9993 }
   9994 
   ; Zero-masked variant of the <4,5,4,5,12,13,10,11> shuffle: result lanes whose
   ; %mask element equals zero take the shuffled value, all other lanes are
   ; zeroinitializer. The checks expect vptestnmq into %k1 followed by a
   ; {%k1} {z} vshuff64x2 that writes zmm0 directly.
   9995 define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
   9996 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask3:
   9997 ; GENERIC:       # %bb.0:
   9998 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   9999 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
   10000 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10001 ;
   10002 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask3:
   10003 ; SKX:       # %bb.0:
   10004 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   10005 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
   10006 ; SKX-NEXT:    retq # sched: [7:1.00]
   10007   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
   10008   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   10009   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   10010   ret <8 x double> %res
   10011 }
   ; Unmasked shuffle with indices <6,7,0,1,8,9,8,9> where the second source is
   ; loaded from %vec2p. The checks expect the load to appear as the mem operand
   ; of a single vshuff64x2 (no separate load instruction is checked for).
   10012 define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
   10013 ; GENERIC-LABEL: test_8xdouble_shuff_mem_mask0:
   10014 ; GENERIC:       # %bb.0:
   10015 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
   10016 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10017 ;
   10018 ; SKX-LABEL: test_8xdouble_shuff_mem_mask0:
   10019 ; SKX:       # %bb.0:
   10020 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
   10021 ; SKX-NEXT:    retq # sched: [7:1.00]
   10022   %vec2 = load <8 x double>, <8 x double>* %vec2p
   10023   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   10024   ret <8 x double> %res
   10025 }
   ; Merge-masked shuffle <6,7,0,1,8,9,8,9> with the second source loaded from
   ; %vec2p: lanes whose %mask element equals zero take the shuffle, the others
   ; keep %vec3. The checks expect vptestnmq into %k1, a {%k1}-masked vshuff64x2
   ; with a mem operand writing zmm1, then a vmovapd of zmm1 into zmm0.
   10026 define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
   10027 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask0:
   10028 ; GENERIC:       # %bb.0:
   10029 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   10030 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
   10031 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   10032 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10033 ;
   10034 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask0:
   10035 ; SKX:       # %bb.0:
   10036 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   10037 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
   10038 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   10039 ; SKX-NEXT:    retq # sched: [7:1.00]
   10040   %vec2 = load <8 x double>, <8 x double>* %vec2p
   10041   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   10042   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   10043   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   10044   ret <8 x double> %res
   10045 }
   10046 
   ; Zero-masked shuffle <6,7,0,1,8,9,8,9> with the second source loaded from
   ; %vec2p: lanes whose %mask element equals zero take the shuffle, all other
   ; lanes are zeroinitializer. The checks expect vptestnmq into %k1 followed by
   ; a {%k1} {z} vshuff64x2 with a mem operand.
   10047 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
   10048 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
   10049 ; GENERIC:       # %bb.0:
   10050 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   10051 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
   10052 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10053 ;
   10054 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
   10055 ; SKX:       # %bb.0:
   10056 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   10057 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
   10058 ; SKX-NEXT:    retq # sched: [7:1.00]
   10059   %vec2 = load <8 x double>, <8 x double>* %vec2p
   10060   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   10061   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   10062   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   10063   ret <8 x double> %res
   10064 }
   10065 
   ; Merge-masked shuffle <6,7,6,7,8,9,10,11> with the second source loaded from
   ; %vec2p: lanes whose %mask element equals zero take the shuffle, the others
   ; keep %vec3. The checks expect vptestnmq into %k1, a {%k1}-masked vshuff64x2
   ; with a mem operand writing zmm1, then a vmovapd of zmm1 into zmm0.
   10066 define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
   10067 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask1:
   10068 ; GENERIC:       # %bb.0:
   10069 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   10070 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [8:1.00]
   10071 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   10072 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10073 ;
   10074 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask1:
   10075 ; SKX:       # %bb.0:
   10076 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   10077 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
   10078 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   10079 ; SKX-NEXT:    retq # sched: [7:1.00]
   10080   %vec2 = load <8 x double>, <8 x double>* %vec2p
   10081   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   10082   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   10083   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   10084   ret <8 x double> %res
   10085 }
   10086 
   ; Zero-masked shuffle <6,7,6,7,8,9,10,11> with the second source loaded from
   ; %vec2p: lanes whose %mask element equals zero take the shuffle, all other
   ; lanes are zeroinitializer. The checks expect vptestnmq into %k1 followed by
   ; a {%k1} {z} vshuff64x2 with a mem operand.
   10087 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
   10088 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
   10089 ; GENERIC:       # %bb.0:
   10090 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   10091 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [8:1.00]
   10092 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10093 ;
   10094 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
   10095 ; SKX:       # %bb.0:
   10096 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   10097 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
   10098 ; SKX-NEXT:    retq # sched: [7:1.00]
   10099   %vec2 = load <8 x double>, <8 x double>* %vec2p
   10100   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   10101   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   10102   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   10103   ret <8 x double> %res
   10104 }
   10105 
   ; Merge-masked shuffle <0,1,2,3,8,9,12,13> with the second source loaded from
   ; %vec2p: lanes whose %mask element equals zero take the shuffle, the others
   ; keep %vec3. The checks expect vptestnmq into %k1, a {%k1}-masked vshuff64x2
   ; with a mem operand writing zmm1, then a vmovapd of zmm1 into zmm0.
   10106 define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
   10107 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask2:
   10108 ; GENERIC:       # %bb.0:
   10109 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   10110 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [8:1.00]
   10111 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   10112 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10113 ;
   10114 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask2:
   10115 ; SKX:       # %bb.0:
   10116 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   10117 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
   10118 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   10119 ; SKX-NEXT:    retq # sched: [7:1.00]
   10120   %vec2 = load <8 x double>, <8 x double>* %vec2p
   10121   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
   10122   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   10123   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   10124   ret <8 x double> %res
   10125 }
   10126 
   ; Zero-masked shuffle <0,1,2,3,8,9,12,13> with the second source loaded from
   ; %vec2p: lanes whose %mask element equals zero take the shuffle, all other
   ; lanes are zeroinitializer. The checks expect vptestnmq into %k1 followed by
   ; a {%k1} {z} vshuff64x2 with a mem operand.
   10127 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
   10128 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
   10129 ; GENERIC:       # %bb.0:
   10130 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   10131 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [8:1.00]
   10132 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10133 ;
   10134 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
   10135 ; SKX:       # %bb.0:
   10136 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   10137 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
   10138 ; SKX-NEXT:    retq # sched: [7:1.00]
   10139   %vec2 = load <8 x double>, <8 x double>* %vec2p
   10140   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
   10141   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   10142   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   10143   ret <8 x double> %res
   10144 }
   10145 
   ; Unmasked shuffle with indices <2,3,0,1,12,13,8,9> where the second source is
   ; loaded from %vec2p. The checks expect the load to appear as the mem operand
   ; of a single vshuff64x2.
   10146 define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
   10147 ; GENERIC-LABEL: test_8xdouble_shuff_mem_mask3:
   10148 ; GENERIC:       # %bb.0:
   10149 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
   10150 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10151 ;
   10152 ; SKX-LABEL: test_8xdouble_shuff_mem_mask3:
   10153 ; SKX:       # %bb.0:
   10154 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
   10155 ; SKX-NEXT:    retq # sched: [7:1.00]
   10156   %vec2 = load <8 x double>, <8 x double>* %vec2p
   10157   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
   10158   ret <8 x double> %res
   10159 }
   ; Merge-masked shuffle <2,3,0,1,12,13,8,9> with the second source loaded from
   ; %vec2p: lanes whose %mask element equals zero take the shuffle, the others
   ; keep %vec3. The checks expect vptestnmq into %k1, a {%k1}-masked vshuff64x2
   ; with a mem operand writing zmm1, then a vmovapd of zmm1 into zmm0.
   10160 define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
   10161 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask3:
   10162 ; GENERIC:       # %bb.0:
   10163 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   10164 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
   10165 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   10166 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10167 ;
   10168 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask3:
   10169 ; SKX:       # %bb.0:
   10170 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   10171 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
   10172 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   10173 ; SKX-NEXT:    retq # sched: [7:1.00]
   10174   %vec2 = load <8 x double>, <8 x double>* %vec2p
   10175   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
   10176   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   10177   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   10178   ret <8 x double> %res
   10179 }
   10180 
   ; Zero-masked shuffle <2,3,0,1,12,13,8,9> with the second source loaded from
   ; %vec2p: lanes whose %mask element equals zero take the shuffle, all other
   ; lanes are zeroinitializer. The checks expect vptestnmq into %k1 followed by
   ; a {%k1} {z} vshuff64x2 with a mem operand.
   10181 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
   10182 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
   10183 ; GENERIC:       # %bb.0:
   10184 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   10185 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
   10186 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10187 ;
   10188 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
   10189 ; SKX:       # %bb.0:
   10190 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   10191 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
   10192 ; SKX-NEXT:    retq # sched: [7:1.00]
   10193   %vec2 = load <8 x double>, <8 x double>* %vec2p
   10194   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
   10195   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   10196   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   10197   ret <8 x double> %res
   10198 }
   10199 
   ; Unmasked <8 x i32> shuffle <4,5,6,7,12,13,14,15>: the upper four elements of
   ; %vec1 followed by the upper four elements of %vec2. For this unmasked form
   ; the checks expect vperm2i128 rather than vshufi32x4.
   10200 define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
   10201 ; GENERIC-LABEL: test_8xi32_shuff_mask0:
   10202 ; GENERIC:       # %bb.0:
   10203 ; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
   10204 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10205 ;
   10206 ; SKX-LABEL: test_8xi32_shuff_mask0:
   10207 ; SKX:       # %bb.0:
   10208 ; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
   10209 ; SKX-NEXT:    retq # sched: [7:1.00]
   10210   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   10211   ret <8 x i32> %res
   10212 }
   ; Merge-masked <8 x i32> shuffle <4,5,6,7,12,13,14,15>: lanes whose %mask
   ; element equals zero take the shuffle, the others keep %vec3. The checks
   ; expect vptestnmd into %k1, a {%k1}-masked vshufi32x4 writing ymm2, then a
   ; vmovdqa of ymm2 into ymm0.
   10213 define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
   10214 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
   10215 ; GENERIC:       # %bb.0:
   10216 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   10217 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
   10218 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
   10219 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10220 ;
   10221 ; SKX-LABEL: test_8xi32_masked_shuff_mask0:
   10222 ; SKX:       # %bb.0:
   10223 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   10224 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
   10225 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   10226 ; SKX-NEXT:    retq # sched: [7:1.00]
   10227   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   10228   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10229   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   10230   ret <8 x i32> %res
   10231 }
   10232 
   ; Zero-masked <8 x i32> shuffle <4,5,6,7,12,13,14,15>: lanes whose %mask
   ; element equals zero take the shuffle, all other lanes are zeroinitializer.
   ; The checks expect vptestnmd into %k1 followed by a {%k1} {z} vshufi32x4.
   10233 define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
   10234 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
   10235 ; GENERIC:       # %bb.0:
   10236 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   10237 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
   10238 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10239 ;
   10240 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0:
   10241 ; SKX:       # %bb.0:
   10242 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   10243 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
   10244 ; SKX-NEXT:    retq # sched: [7:1.00]
   10245   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   10246   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10247   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   10248   ret <8 x i32> %res
   10249 }
   ; Merge-masked <8 x i32> shuffle <4,5,6,7,8,9,10,11> (upper four elements of
   ; %vec1, then lower four of %vec2): lanes whose %mask element equals zero take
   ; the shuffle, the others keep %vec3. The checks expect vptestnmd into %k1, a
   ; {%k1}-masked vshufi32x4 writing ymm2, then a vmovdqa of ymm2 into ymm0.
   10250 define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
   10251 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
   10252 ; GENERIC:       # %bb.0:
   10253 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   10254 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
   10255 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
   10256 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10257 ;
   10258 ; SKX-LABEL: test_8xi32_masked_shuff_mask1:
   10259 ; SKX:       # %bb.0:
   10260 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   10261 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
   10262 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   10263 ; SKX-NEXT:    retq # sched: [7:1.00]
   10264   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   10265   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10266   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   10267   ret <8 x i32> %res
   10268 }
   10269 
   ; Zero-masked <8 x i32> shuffle <4,5,6,7,8,9,10,11>: lanes whose %mask element
   ; equals zero take the shuffle, all other lanes are zeroinitializer. The
   ; checks expect vptestnmd into %k1 followed by a {%k1} {z} vshufi32x4.
   10270 define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
   10271 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
   10272 ; GENERIC:       # %bb.0:
   10273 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   10274 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
   10275 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10276 ;
   10277 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1:
   10278 ; SKX:       # %bb.0:
   10279 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   10280 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
   10281 ; SKX-NEXT:    retq # sched: [7:1.00]
   10282   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   10283   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10284   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   10285   ret <8 x i32> %res
   10286 }
   ; Merge-masked <8 x i32> shuffle <4,5,6,7,12,13,14,15>: lanes whose %mask
   ; element equals zero take the shuffle, the others keep %vec3. The checks
   ; expect vptestnmd into %k1, a {%k1}-masked vshufi32x4 writing ymm2, then a
   ; vmovdqa of ymm2 into ymm0.
   ; NOTE(review): the shuffle indices are the same as in
   ; test_8xi32_masked_shuff_mask0 - confirm whether the duplication is intended.
   10287 define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
   10288 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
   10289 ; GENERIC:       # %bb.0:
   10290 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   10291 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
   10292 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
   10293 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10294 ;
   10295 ; SKX-LABEL: test_8xi32_masked_shuff_mask2:
   10296 ; SKX:       # %bb.0:
   10297 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   10298 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
   10299 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   10300 ; SKX-NEXT:    retq # sched: [7:1.00]
   10301   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   10302   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10303   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   10304   ret <8 x i32> %res
   10305 }
   10306 
   ; Zero-masked <8 x i32> shuffle <4,5,6,7,12,13,14,15>: lanes whose %mask
   ; element equals zero take the shuffle, all other lanes are zeroinitializer.
   ; The checks expect vptestnmd into %k1 followed by a {%k1} {z} vshufi32x4.
   ; NOTE(review): the shuffle indices are the same as in
   ; test_8xi32_zero_masked_shuff_mask0 - confirm whether that is intended.
   10307 define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
   10308 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
   10309 ; GENERIC:       # %bb.0:
   10310 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   10311 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
   10312 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10313 ;
   10314 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2:
   10315 ; SKX:       # %bb.0:
   10316 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   10317 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
   10318 ; SKX-NEXT:    retq # sched: [7:1.00]
   10319   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   10320   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10321   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   10322   ret <8 x i32> %res
   10323 }
   ; Unmasked <8 x i32> shuffle <4,5,6,7,8,9,10,11>: the upper four elements of
   ; %vec1 followed by the lower four elements of %vec2. For this unmasked form
   ; the checks expect vperm2i128 rather than vshufi32x4.
   10324 define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
   10325 ; GENERIC-LABEL: test_8xi32_shuff_mask3:
   10326 ; GENERIC:       # %bb.0:
   10327 ; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   10328 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10329 ;
   10330 ; SKX-LABEL: test_8xi32_shuff_mask3:
   10331 ; SKX:       # %bb.0:
   10332 ; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   10333 ; SKX-NEXT:    retq # sched: [7:1.00]
   10334   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   10335   ret <8 x i32> %res
   10336 }
   ; Merge-masked <8 x i32> shuffle <4,5,6,7,8,9,10,11>: lanes whose %mask element
   ; equals zero take the shuffle, the others keep %vec3. The checks expect
   ; vptestnmd into %k1, a {%k1}-masked vshufi32x4 writing ymm2, then a vmovdqa
   ; of ymm2 into ymm0.
   ; NOTE(review): the shuffle indices are the same as in
   ; test_8xi32_masked_shuff_mask1 - confirm whether the duplication is intended.
   10337 define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
   10338 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
   10339 ; GENERIC:       # %bb.0:
   10340 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   10341 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
   10342 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
   10343 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10344 ;
   10345 ; SKX-LABEL: test_8xi32_masked_shuff_mask3:
   10346 ; SKX:       # %bb.0:
   10347 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   10348 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
   10349 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   10350 ; SKX-NEXT:    retq # sched: [7:1.00]
   10351   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   10352   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10353   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   10354   ret <8 x i32> %res
   10355 }
   10356 
   ; Zero-masked <8 x i32> shuffle <4,5,6,7,8,9,10,11>: lanes whose %mask element
   ; equals zero take the shuffle, all other lanes are zeroinitializer. The
   ; checks expect vptestnmd into %k1 followed by a {%k1} {z} vshufi32x4.
   ; NOTE(review): the shuffle indices are the same as in
   ; test_8xi32_zero_masked_shuff_mask1 - confirm whether that is intended.
   10357 define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
   10358 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
   10359 ; GENERIC:       # %bb.0:
   10360 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   10361 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
   10362 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10363 ;
   10364 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3:
   10365 ; SKX:       # %bb.0:
   10366 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   10367 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
   10368 ; SKX-NEXT:    retq # sched: [7:1.00]
   10369   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   10370   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10371   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   10372   ret <8 x i32> %res
   10373 }
   ; Unmasked <8 x i32> shuffle <4,5,6,7,12,13,14,15> where the second source is
   ; loaded from %vec2p. The checks expect a single vperm2i128 with the load
   ; appearing as its mem operand.
   10374 define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
   10375 ; GENERIC-LABEL: test_8xi32_shuff_mem_mask0:
   10376 ; GENERIC:       # %bb.0:
   10377 ; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
   10378 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10379 ;
   10380 ; SKX-LABEL: test_8xi32_shuff_mem_mask0:
   10381 ; SKX:       # %bb.0:
   10382 ; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
   10383 ; SKX-NEXT:    retq # sched: [7:1.00]
   10384   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   10385   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   10386   ret <8 x i32> %res
   10387 }
   ; Merge-masked <8 x i32> shuffle <4,5,6,7,12,13,14,15> with the second source
   ; loaded from %vec2p: lanes whose %mask element equals zero take the shuffle,
   ; the others keep %vec3. The checks expect vptestnmd into %k1, a {%k1}-masked
   ; vshufi32x4 with a mem operand writing ymm1, then a vmovdqa into ymm0.
   10388 define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
   10389 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
   10390 ; GENERIC:       # %bb.0:
   10391 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   10392 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
   10393 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   10394 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10395 ;
   10396 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0:
   10397 ; SKX:       # %bb.0:
   10398 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   10399 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
   10400 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   10401 ; SKX-NEXT:    retq # sched: [7:1.00]
   10402   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   10403   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   10404   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10405   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   10406   ret <8 x i32> %res
   10407 }
   10408 
   ; Zero-masked <8 x i32> shuffle <4,5,6,7,12,13,14,15> with the second source
   ; loaded from %vec2p: lanes whose %mask element equals zero take the shuffle,
   ; all other lanes are zeroinitializer. The checks expect vptestnmd into %k1
   ; followed by a {%k1} {z} vshufi32x4 with a mem operand.
   10409 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
   10410 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
   10411 ; GENERIC:       # %bb.0:
   10412 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   10413 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
   10414 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10415 ;
   10416 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
   10417 ; SKX:       # %bb.0:
   10418 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   10419 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
   10420 ; SKX-NEXT:    retq # sched: [7:1.00]
   10421   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   10422   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   10423   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10424   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   10425   ret <8 x i32> %res
   10426 }
   10427 
   ; Merge-masked <8 x i32> shuffle <4,5,6,7,8,9,10,11> with the second source
   ; loaded from %vec2p: lanes whose %mask element equals zero take the shuffle,
   ; the others keep %vec3. The checks expect vptestnmd into %k1, a {%k1}-masked
   ; vshufi32x4 with a mem operand writing ymm1, then a vmovdqa into ymm0.
   10428 define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
   10429 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
   10430 ; GENERIC:       # %bb.0:
   10431 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   10432 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
   10433 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   10434 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10435 ;
   10436 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1:
   10437 ; SKX:       # %bb.0:
   10438 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   10439 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
   10440 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   10441 ; SKX-NEXT:    retq # sched: [7:1.00]
   10442   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   10443   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   10444   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10445   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   10446   ret <8 x i32> %res
   10447 }
   10448 
   ; Zero-masked <8 x i32> shuffle <4,5,6,7,8,9,10,11> with the second source
   ; loaded from %vec2p: lanes whose %mask element equals zero take the shuffle,
   ; all other lanes are zeroinitializer. The checks expect vptestnmd into %k1
   ; followed by a {%k1} {z} vshufi32x4 with a mem operand.
   10449 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
   10450 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
   10451 ; GENERIC:       # %bb.0:
   10452 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   10453 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
   10454 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10455 ;
   10456 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
   10457 ; SKX:       # %bb.0:
   10458 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   10459 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
   10460 ; SKX-NEXT:    retq # sched: [7:1.00]
   10461   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   10462   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   10463   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10464   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   10465   ret <8 x i32> %res
   10466 }
   10467 
   ; Merge-masked <8 x i32> shuffle <4,5,6,7,8,9,10,11> with the second source
   ; loaded from %vec2p: lanes whose %mask element equals zero take the shuffle,
   ; the others keep %vec3. The checks expect vptestnmd into %k1, a {%k1}-masked
   ; vshufi32x4 with a mem operand writing ymm1, then a vmovdqa into ymm0.
   ; NOTE(review): the shuffle indices are the same as in
   ; test_8xi32_masked_shuff_mem_mask1 - confirm whether that is intended.
   10468 define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
   10469 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
   10470 ; GENERIC:       # %bb.0:
   10471 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   10472 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
   10473 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   10474 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10475 ;
   10476 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2:
   10477 ; SKX:       # %bb.0:
   10478 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   10479 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
   10480 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   10481 ; SKX-NEXT:    retq # sched: [7:1.00]
   10482   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   10483   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   10484   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10485   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   10486   ret <8 x i32> %res
   10487 }
   10488 
   ; Zero-masked <8 x i32> shuffle <4,5,6,7,8,9,10,11> with the second source
   ; loaded from %vec2p: lanes whose %mask element equals zero take the shuffle,
   ; all other lanes are zeroinitializer. The checks expect vptestnmd into %k1
   ; followed by a {%k1} {z} vshufi32x4 with a mem operand.
   ; NOTE(review): the shuffle indices are the same as in
   ; test_8xi32_zero_masked_shuff_mem_mask1 - confirm whether that is intended.
   10489 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
   10490 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
   10491 ; GENERIC:       # %bb.0:
   10492 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   10493 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
   10494 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10495 ;
   10496 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
   10497 ; SKX:       # %bb.0:
   10498 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   10499 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
   10500 ; SKX-NEXT:    retq # sched: [7:1.00]
   10501   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   10502   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   10503   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   10504   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   10505   ret <8 x i32> %res
   10506 }
   10507 
   10508 define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
   10509 ; GENERIC-LABEL: test_8xi32_shuff_mem_mask3:
   10510 ; GENERIC:       # %bb.0:
   10511 ; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
   10512 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10513 ;
   10514 ; SKX-LABEL: test_8xi32_shuff_mem_mask3:
   10515 ; SKX:       # %bb.0:
   10516 ; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
   10517 ; SKX-NEXT:    retq # sched: [7:1.00]
   10518   %vec2 = load <8 x i32>, <8 x i32>* %vec2p ; second shuffle source is read from memory (the mem[...] operand in the expected asm)
   10519   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; unmasked: elements 4-7 of %vec1, then elements 0-3 of %vec2; the checks expect vperm2i128 for this form
   10520   ret <8 x i32> %res
   10521 }
   10522 define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
   10523 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
   10524 ; GENERIC:       # %bb.0:
   10525 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   10526 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
   10527 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   10528 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10529 ;
   10530 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3:
   10531 ; SKX:       # %bb.0:
   10532 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   10533 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
   10534 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   10535 ; SKX-NEXT:    retq # sched: [7:1.00]
   10536   %vec2 = load <8 x i32>, <8 x i32>* %vec2p ; second shuffle source is read from memory (the mem[...] operand in the expected asm)
   10537   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; elements 4-7 of %vec1, then elements 0-3 of %vec2 (indices 8-11)
   10538   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10539   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3 ; merge-masked result: false-predicate lanes keep %vec3
   10540   ret <8 x i32> %res
   10541 }
   10542 
   10543 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
   10544 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
   10545 ; GENERIC:       # %bb.0:
   10546 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   10547 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
   10548 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10549 ;
   10550 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
   10551 ; SKX:       # %bb.0:
   10552 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   10553 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
   10554 ; SKX-NEXT:    retq # sched: [7:1.00]
   10555   %vec2 = load <8 x i32>, <8 x i32>* %vec2p ; second shuffle source is read from memory (the mem[...] operand in the expected asm)
   10556   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; elements 4-7 of %vec1, then elements 0-3 of %vec2 (indices 8-11)
   10557   %cmp = icmp eq <8 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10558   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ; zero-masked result: false-predicate lanes become 0
   10559   ret <8 x i32> %res
   10560 }
   10561 
   10562 define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
   10563 ; GENERIC-LABEL: test_16xi32_shuff_mask0:
   10564 ; GENERIC:       # %bb.0:
   10565 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
   10566 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10567 ;
   10568 ; SKX-LABEL: test_16xi32_shuff_mask0:
   10569 ; SKX:       # %bb.0:
   10570 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
   10571 ; SKX-NEXT:    retq # sched: [7:1.00]
   10572   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31> ; unmasked; 4-element groups: %vec1[4-7], %vec1[4-7], %vec2[4-7], %vec2[12-15] (indices >= 16 address %vec2)
   10573   ret <16 x i32> %res
   10574 }
   10575 define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
   10576 ; GENERIC-LABEL: test_16xi32_masked_shuff_mask0:
   10577 ; GENERIC:       # %bb.0:
   10578 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   10579 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
   10580 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
   10581 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10582 ;
   10583 ; SKX-LABEL: test_16xi32_masked_shuff_mask0:
   10584 ; SKX:       # %bb.0:
   10585 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   10586 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
   10587 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
   10588 ; SKX-NEXT:    retq # sched: [7:1.00]
   10589   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31> ; 4-element groups: %vec1[4-7], %vec1[4-7], %vec2[4-7], %vec2[12-15] (indices >= 16 address %vec2)
   10590   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10591   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3 ; merge-masked result: false-predicate lanes keep %vec3
   10592   ret <16 x i32> %res
   10593 }
   10594 
   10595 define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
   10596 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask0:
   10597 ; GENERIC:       # %bb.0:
   10598 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   10599 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
   10600 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10601 ;
   10602 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mask0:
   10603 ; SKX:       # %bb.0:
   10604 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   10605 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
   10606 ; SKX-NEXT:    retq # sched: [7:1.00]
   10607   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31> ; 4-element groups: %vec1[4-7], %vec1[4-7], %vec2[4-7], %vec2[12-15] (indices >= 16 address %vec2)
   10608   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10609   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; zero-masked result: false-predicate lanes become 0
   10610   ret <16 x i32> %res
   10611 }
   10612 define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
   10613 ; GENERIC-LABEL: test_16xi32_masked_shuff_mask1:
   10614 ; GENERIC:       # %bb.0:
   10615 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   10616 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
   10617 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
   10618 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10619 ;
   10620 ; SKX-LABEL: test_16xi32_masked_shuff_mask1:
   10621 ; SKX:       # %bb.0:
   10622 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   10623 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
   10624 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
   10625 ; SKX-NEXT:    retq # sched: [7:1.00]
   10626   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; 4-element groups: %vec1[8-11], %vec1[8-11], %vec2[8-11], %vec2[4-7] (indices >= 16 address %vec2)
   10627   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10628   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3 ; merge-masked result: false-predicate lanes keep %vec3
   10629   ret <16 x i32> %res
   10630 }
   10631 
   10632 define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
   10633 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask1:
   10634 ; GENERIC:       # %bb.0:
   10635 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   10636 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
   10637 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10638 ;
   10639 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mask1:
   10640 ; SKX:       # %bb.0:
   10641 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   10642 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
   10643 ; SKX-NEXT:    retq # sched: [7:1.00]
   10644   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; 4-element groups: %vec1[8-11], %vec1[8-11], %vec2[8-11], %vec2[4-7] (indices >= 16 address %vec2)
   10645   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10646   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; zero-masked result: false-predicate lanes become 0
   10647   ret <16 x i32> %res
   10648 }
   10649 define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
   10650 ; GENERIC-LABEL: test_16xi32_masked_shuff_mask2:
   10651 ; GENERIC:       # %bb.0:
   10652 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   10653 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
   10654 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
   10655 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10656 ;
   10657 ; SKX-LABEL: test_16xi32_masked_shuff_mask2:
   10658 ; SKX:       # %bb.0:
   10659 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   10660 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
   10661 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
   10662 ; SKX-NEXT:    retq # sched: [7:1.00]
   10663   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19> ; 4-element groups: %vec1[4-7], %vec1[8-11], %vec2[0-3], %vec2[0-3] (indices >= 16 address %vec2)
   10664   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10665   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3 ; merge-masked result: false-predicate lanes keep %vec3
   10666   ret <16 x i32> %res
   10667 }
   10668 
   10669 define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
   10670 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask2:
   10671 ; GENERIC:       # %bb.0:
   10672 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   10673 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
   10674 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10675 ;
   10676 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mask2:
   10677 ; SKX:       # %bb.0:
   10678 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   10679 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
   10680 ; SKX-NEXT:    retq # sched: [7:1.00]
   10681   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19> ; 4-element groups: %vec1[4-7], %vec1[8-11], %vec2[0-3], %vec2[0-3] (indices >= 16 address %vec2)
   10682   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10683   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; zero-masked result: false-predicate lanes become 0
   10684   ret <16 x i32> %res
   10685 }
   10686 define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
   10687 ; GENERIC-LABEL: test_16xi32_shuff_mask3:
   10688 ; GENERIC:       # %bb.0:
   10689 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
   10690 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10691 ;
   10692 ; SKX-LABEL: test_16xi32_shuff_mask3:
   10693 ; SKX:       # %bb.0:
   10694 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
   10695 ; SKX-NEXT:    retq # sched: [7:1.00]
   10696   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; unmasked; 4-element groups: %vec1[4-7], %vec1[0-3], %vec2[8-11], %vec2[4-7] (indices >= 16 address %vec2)
   10697   ret <16 x i32> %res
   10698 }
   10699 define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
   10700 ; GENERIC-LABEL: test_16xi32_masked_shuff_mask3:
   10701 ; GENERIC:       # %bb.0:
   10702 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   10703 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
   10704 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
   10705 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10706 ;
   10707 ; SKX-LABEL: test_16xi32_masked_shuff_mask3:
   10708 ; SKX:       # %bb.0:
   10709 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   10710 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
   10711 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
   10712 ; SKX-NEXT:    retq # sched: [7:1.00]
   10713   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; 4-element groups: %vec1[4-7], %vec1[0-3], %vec2[8-11], %vec2[4-7] (indices >= 16 address %vec2)
   10714   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10715   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3 ; merge-masked result: false-predicate lanes keep %vec3
   10716   ret <16 x i32> %res
   10717 }
   10718 
   10719 define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
   10720 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask3:
   10721 ; GENERIC:       # %bb.0:
   10722 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   10723 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
   10724 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10725 ;
   10726 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mask3:
   10727 ; SKX:       # %bb.0:
   10728 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   10729 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
   10730 ; SKX-NEXT:    retq # sched: [7:1.00]
   10731   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; 4-element groups: %vec1[4-7], %vec1[0-3], %vec2[8-11], %vec2[4-7] (indices >= 16 address %vec2)
   10732   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10733   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; zero-masked result: false-predicate lanes become 0
   10734   ret <16 x i32> %res
   10735 }
   10736 define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {
   10737 ; GENERIC-LABEL: test_16xi32_shuff_mem_mask0:
   10738 ; GENERIC:       # %bb.0:
   10739 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
   10740 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10741 ;
   10742 ; SKX-LABEL: test_16xi32_shuff_mem_mask0:
   10743 ; SKX:       # %bb.0:
   10744 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
   10745 ; SKX-NEXT:    retq # sched: [7:1.00]
   10746   %vec2 = load <16 x i32>, <16 x i32>* %vec2p ; second shuffle source is read from memory (the mem[...] operand in the expected asm)
   10747   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19> ; unmasked; 4-element groups: %vec1[8-11], %vec1[4-7], %vec2[8-11], %vec2[0-3] (indices >= 16 address %vec2)
   10748   ret <16 x i32> %res
   10749 }
   10750 define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
   10751 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask0:
   10752 ; GENERIC:       # %bb.0:
   10753 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   10754 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
   10755 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   10756 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10757 ;
   10758 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask0:
   10759 ; SKX:       # %bb.0:
   10760 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   10761 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
   10762 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   10763 ; SKX-NEXT:    retq # sched: [7:1.00]
   10764   %vec2 = load <16 x i32>, <16 x i32>* %vec2p ; second shuffle source is read from memory (the mem[...] operand in the expected asm)
   10765   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19> ; 4-element groups: %vec1[8-11], %vec1[4-7], %vec2[8-11], %vec2[0-3] (indices >= 16 address %vec2)
   10766   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10767   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3 ; merge-masked result: false-predicate lanes keep %vec3
   10768   ret <16 x i32> %res
   10769 }
   10770 
   10771 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
   10772 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
   10773 ; GENERIC:       # %bb.0:
   10774 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   10775 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
   10776 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10777 ;
   10778 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
   10779 ; SKX:       # %bb.0:
   10780 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   10781 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
   10782 ; SKX-NEXT:    retq # sched: [7:1.00]
   10783   %vec2 = load <16 x i32>, <16 x i32>* %vec2p ; second shuffle source is read from memory (the mem[...] operand in the expected asm)
   10784   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19> ; 4-element groups: %vec1[8-11], %vec1[4-7], %vec2[8-11], %vec2[0-3] (indices >= 16 address %vec2)
   10785   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10786   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; zero-masked result: false-predicate lanes become 0
   10787   ret <16 x i32> %res
   10788 }
   10789 
   10790 define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
   10791 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask1:
   10792 ; GENERIC:       # %bb.0:
   10793 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   10794 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [8:1.00]
   10795 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   10796 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10797 ;
   10798 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask1:
   10799 ; SKX:       # %bb.0:
   10800 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   10801 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
   10802 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   10803 ; SKX-NEXT:    retq # sched: [7:1.00]
   10804   %vec2 = load <16 x i32>, <16 x i32>* %vec2p ; second shuffle source is read from memory (the mem[...] operand in the expected asm)
   10805   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27> ; 4-element groups: %vec1[4-7], %vec1[4-7], %vec2[0-3], %vec2[8-11] (indices >= 16 address %vec2)
   10806   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10807   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3 ; merge-masked result: false-predicate lanes keep %vec3
   10808   ret <16 x i32> %res
   10809 }
   10810 
   10811 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
   10812 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
   10813 ; GENERIC:       # %bb.0:
   10814 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   10815 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [8:1.00]
   10816 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10817 ;
   10818 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
   10819 ; SKX:       # %bb.0:
   10820 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   10821 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
   10822 ; SKX-NEXT:    retq # sched: [7:1.00]
   10823   %vec2 = load <16 x i32>, <16 x i32>* %vec2p ; second shuffle source is read from memory (the mem[...] operand in the expected asm)
   10824   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27> ; 4-element groups: %vec1[4-7], %vec1[4-7], %vec2[0-3], %vec2[8-11] (indices >= 16 address %vec2)
   10825   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10826   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; zero-masked result: false-predicate lanes become 0
   10827   ret <16 x i32> %res
   10828 }
   10829 
   10830 define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
   10831 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask2:
   10832 ; GENERIC:       # %bb.0:
   10833 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   10834 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
   10835 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   10836 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10837 ;
   10838 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask2:
   10839 ; SKX:       # %bb.0:
   10840 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   10841 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
   10842 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   10843 ; SKX-NEXT:    retq # sched: [7:1.00]
   10844   %vec2 = load <16 x i32>, <16 x i32>* %vec2p ; second shuffle source is read from memory (the mem[...] operand in the expected asm)
   10845   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31> ; 4-element groups: %vec1[4-7], %vec1[8-11], %vec2[12-15], %vec2[12-15] (indices >= 16 address %vec2)
   10846   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10847   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3 ; merge-masked result: false-predicate lanes keep %vec3
   10848   ret <16 x i32> %res
   10849 }
   10850 
   10851 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
   10852 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
   10853 ; GENERIC:       # %bb.0:
   10854 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   10855 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
   10856 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10857 ;
   10858 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
   10859 ; SKX:       # %bb.0:
   10860 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   10861 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
   10862 ; SKX-NEXT:    retq # sched: [7:1.00]
   10863   %vec2 = load <16 x i32>, <16 x i32>* %vec2p ; second shuffle source is read from memory (the mem[...] operand in the expected asm)
   10864   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31> ; 4-element groups: %vec1[4-7], %vec1[8-11], %vec2[12-15], %vec2[12-15] (indices >= 16 address %vec2)
   10865   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10866   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; zero-masked result: false-predicate lanes become 0
   10867   ret <16 x i32> %res
   10868 }
   10869 
   10870 define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
   10871 ; GENERIC-LABEL: test_16xi32_shuff_mem_mask3:
   10872 ; GENERIC:       # %bb.0:
   10873 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
   10874 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10875 ;
   10876 ; SKX-LABEL: test_16xi32_shuff_mem_mask3:
   10877 ; SKX:       # %bb.0:
   10878 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
   10879 ; SKX-NEXT:    retq # sched: [7:1.00]
   10880   %vec2 = load <16 x i32>, <16 x i32>* %vec2p ; second shuffle source is read from memory (the mem[...] operand in the expected asm)
   10881   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31> ; unmasked; 4-element groups: %vec1[4-7], %vec1[4-7], %vec2[4-7], %vec2[12-15] (indices >= 16 address %vec2)
   10882   ret <16 x i32> %res
   10883 }
   10884 define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
   10885 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask3:
   10886 ; GENERIC:       # %bb.0:
   10887 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   10888 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
   10889 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   10890 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10891 ;
   10892 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask3:
   10893 ; SKX:       # %bb.0:
   10894 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   10895 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
   10896 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   10897 ; SKX-NEXT:    retq # sched: [7:1.00]
   10898   %vec2 = load <16 x i32>, <16 x i32>* %vec2p ; second shuffle source is read from memory (the mem[...] operand in the expected asm)
   10899   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31> ; 4-element groups: %vec1[4-7], %vec1[4-7], %vec2[4-7], %vec2[12-15] (indices >= 16 address %vec2)
   10900   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10901   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3 ; merge-masked result: false-predicate lanes keep %vec3
   10902   ret <16 x i32> %res
   10903 }
   10904 
   10905 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
   10906 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
   10907 ; GENERIC:       # %bb.0:
   10908 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   10909 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
   10910 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10911 ;
   10912 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
   10913 ; SKX:       # %bb.0:
   10914 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   10915 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
   10916 ; SKX-NEXT:    retq # sched: [7:1.00]
   10917   %vec2 = load <16 x i32>, <16 x i32>* %vec2p ; second shuffle source is read from memory (the mem[...] operand in the expected asm)
   10918   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31> ; 4-element groups: %vec1[4-7], %vec1[4-7], %vec2[4-7], %vec2[12-15] (indices >= 16 address %vec2)
   10919   %cmp = icmp eq <16 x i32> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10920   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ; zero-masked result: false-predicate lanes become 0
   10921   ret <16 x i32> %res
   10922 }
   10923 
   10924 define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
   10925 ; GENERIC-LABEL: test_4xi64_shuff_mask0:
   10926 ; GENERIC:       # %bb.0:
   10927 ; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   10928 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10929 ;
   10930 ; SKX-LABEL: test_4xi64_shuff_mask0:
   10931 ; SKX:       # %bb.0:
   10932 ; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   10933 ; SKX-NEXT:    retq # sched: [7:1.00]
   10934   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; unmasked: elements 2-3 of %vec1, then elements 0-1 of %vec2 (indices 4-5); the checks expect vperm2i128 for this form
   10935   ret <4 x i64> %res
   10936 }
   10937 define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
   10938 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
   10939 ; GENERIC:       # %bb.0:
   10940 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   10941 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   10942 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
   10943 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10944 ;
   10945 ; SKX-LABEL: test_4xi64_masked_shuff_mask0:
   10946 ; SKX:       # %bb.0:
   10947 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   10948 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   10949 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   10950 ; SKX-NEXT:    retq # sched: [7:1.00]
   10951   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; elements 2-3 of %vec1, then elements 0-1 of %vec2 (indices 4-5)
   10952   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10953   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3 ; merge-masked result: false-predicate lanes keep %vec3
   10954   ret <4 x i64> %res
   10955 }
   10956 
   10957 define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
   10958 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
   10959 ; GENERIC:       # %bb.0:
   10960 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   10961 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   10962 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10963 ;
   10964 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0:
   10965 ; SKX:       # %bb.0:
   10966 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   10967 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   10968 ; SKX-NEXT:    retq # sched: [7:1.00]
   10969   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; elements 2-3 of %vec1, then elements 0-1 of %vec2 (indices 4-5)
   10970   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10971   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; zero-masked result: false-predicate lanes become 0
   10972   ret <4 x i64> %res
   10973 }
   10974 define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
   10975 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
   10976 ; GENERIC:       # %bb.0:
   10977 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   10978 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
   10979 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
   10980 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   10981 ;
   10982 ; SKX-LABEL: test_4xi64_masked_shuff_mask1:
   10983 ; SKX:       # %bb.0:
   10984 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   10985 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
   10986 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   10987 ; SKX-NEXT:    retq # sched: [7:1.00]
   10988   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; elements 2-3 of %vec1, then elements 2-3 of %vec2 (indices 6-7)
   10989   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; lane predicate: true where %mask == 0
   10990   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3 ; merge-masked result: false-predicate lanes keep %vec3
   10991   ret <4 x i64> %res
   10992 }
   10993 
   10994 define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; zero-masked register shuffle: lanes with a zero %mask element take the shuffle, the rest become 0
   10995 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1:
   10996 ; GENERIC:       # %bb.0:
   10997 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   10998 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
   10999 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11000 ;
   11001 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1:
   11002 ; SKX:       # %bb.0:
   11003 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   11004 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
   11005 ; SKX-NEXT:    retq # sched: [7:1.00]
   11006   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; indices 0-3 pick from %vec1, 4-7 from %vec2: result = {vec1[2], vec1[3], vec2[2], vec2[3]}
   11007   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11008   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; %shuf where %cmp is true, 0 elsewhere
   11009   ret <4 x i64> %res
   11010 }
   11011 define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; merge-masked register shuffle: lanes with a zero %mask element take the shuffle, the rest keep %vec3
   11012 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask2:
   11013 ; GENERIC:       # %bb.0:
   11014 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   11015 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   11016 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
   11017 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11018 ;
   11019 ; SKX-LABEL: test_4xi64_masked_shuff_mask2:
   11020 ; SKX:       # %bb.0:
   11021 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   11022 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   11023 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   11024 ; SKX-NEXT:    retq # sched: [7:1.00]
   11025   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; indices 0-3 pick from %vec1, 4-7 from %vec2: result = {vec1[2], vec1[3], vec2[0], vec2[1]}
   11026   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11027   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3 ; %shuf where %cmp is true, %vec3 elsewhere
   11028   ret <4 x i64> %res
   11029 }
   11030 
   11031 define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; zero-masked register shuffle: lanes with a zero %mask element take the shuffle, the rest become 0
   11032 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2:
   11033 ; GENERIC:       # %bb.0:
   11034 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   11035 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   11036 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11037 ;
   11038 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2:
   11039 ; SKX:       # %bb.0:
   11040 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   11041 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   11042 ; SKX-NEXT:    retq # sched: [7:1.00]
   11043   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; result = {vec1[2], vec1[3], vec2[0], vec2[1]}; NOTE(review): same index list as test_4xi64_zero_masked_shuff_mask0 - confirm the overlap is intended
   11044   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11045   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; %shuf where %cmp is true, 0 elsewhere
   11046   ret <4 x i64> %res
   11047 }
   11048 define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { ; unmasked register shuffle; the expected asm below is a single vperm2i128
   11049 ; GENERIC-LABEL: test_4xi64_shuff_mask3:
   11050 ; GENERIC:       # %bb.0:
   11051 ; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
   11052 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11053 ;
   11054 ; SKX-LABEL: test_4xi64_shuff_mask3:
   11055 ; SKX:       # %bb.0:
   11056 ; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
   11057 ; SKX-NEXT:    retq # sched: [7:1.00]
   11058   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; indices 0-3 pick from %vec1, 4-7 from %vec2: result = {vec1[2], vec1[3], vec2[2], vec2[3]}
   11059   ret <4 x i64> %res
   11060 }
   11061 define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; merge-masked register shuffle: lanes with a zero %mask element take the shuffle, the rest keep %vec3
   11062 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask3:
   11063 ; GENERIC:       # %bb.0:
   11064 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   11065 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
   11066 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
   11067 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11068 ;
   11069 ; SKX-LABEL: test_4xi64_masked_shuff_mask3:
   11070 ; SKX:       # %bb.0:
   11071 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   11072 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
   11073 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   11074 ; SKX-NEXT:    retq # sched: [7:1.00]
   11075   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = {vec1[2], vec1[3], vec2[2], vec2[3]}; NOTE(review): same index list as test_4xi64_masked_shuff_mask1 - confirm the overlap is intended
   11076   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11077   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3 ; %shuf where %cmp is true, %vec3 elsewhere
   11078   ret <4 x i64> %res
   11079 }
   11080 
   11081 define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; zero-masked register shuffle: lanes with a zero %mask element take the shuffle, the rest become 0
   11082 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3:
   11083 ; GENERIC:       # %bb.0:
   11084 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   11085 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
   11086 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11087 ;
   11088 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3:
   11089 ; SKX:       # %bb.0:
   11090 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   11091 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
   11092 ; SKX-NEXT:    retq # sched: [7:1.00]
   11093   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = {vec1[2], vec1[3], vec2[2], vec2[3]}; NOTE(review): same index list as test_4xi64_zero_masked_shuff_mask1 - confirm the overlap is intended
   11094   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11095   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; %shuf where %cmp is true, 0 elsewhere
   11096   ret <4 x i64> %res
   11097 }
   11098 define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; unmasked shuffle whose second operand is loaded from %vec2p; the expected asm below is a single vperm2i128 with a memory source
   11099 ; GENERIC-LABEL: test_4xi64_shuff_mem_mask0:
   11100 ; GENERIC:       # %bb.0:
   11101 ; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
   11102 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11103 ;
   11104 ; SKX-LABEL: test_4xi64_shuff_mem_mask0:
   11105 ; SKX:       # %bb.0:
   11106 ; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
   11107 ; SKX-NEXT:    retq # sched: [7:1.00]
   11108   %vec2 = load <4 x i64>, <4 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11109   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; indices 0-3 pick from %vec1, 4-7 from %vec2: result = {vec1[2], vec1[3], vec2[2], vec2[3]}
   11110   ret <4 x i64> %res
   11111 }
   11112 define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; merge-masked shuffle with the second operand loaded from %vec2p: zero-%mask lanes take the shuffle, the rest keep %vec3
   11113 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
   11114 ; GENERIC:       # %bb.0:
   11115 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   11116 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
   11117 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   11118 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11119 ;
   11120 ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0:
   11121 ; SKX:       # %bb.0:
   11122 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   11123 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
   11124 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   11125 ; SKX-NEXT:    retq # sched: [7:1.00]
   11126   %vec2 = load <4 x i64>, <4 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11127   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; indices 0-3 pick from %vec1, 4-7 from %vec2: result = {vec1[2], vec1[3], vec2[2], vec2[3]}
   11128   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11129   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3 ; %shuf where %cmp is true, %vec3 elsewhere
   11130   ret <4 x i64> %res
   11131 }
   11132 
   11133 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; zero-masked shuffle with the second operand loaded from %vec2p: zero-%mask lanes take the shuffle, the rest become 0
   11134 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
   11135 ; GENERIC:       # %bb.0:
   11136 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   11137 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
   11138 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11139 ;
   11140 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
   11141 ; SKX:       # %bb.0:
   11142 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   11143 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
   11144 ; SKX-NEXT:    retq # sched: [7:1.00]
   11145   %vec2 = load <4 x i64>, <4 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11146   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; indices 0-3 pick from %vec1, 4-7 from %vec2: result = {vec1[2], vec1[3], vec2[2], vec2[3]}
   11147   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11148   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; %shuf where %cmp is true, 0 elsewhere
   11149   ret <4 x i64> %res
   11150 }
   11151 
   11152 define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; merge-masked shuffle with the second operand loaded from %vec2p: zero-%mask lanes take the shuffle, the rest keep %vec3
   11153 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
   11154 ; GENERIC:       # %bb.0:
   11155 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   11156 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
   11157 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   11158 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11159 ;
   11160 ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1:
   11161 ; SKX:       # %bb.0:
   11162 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   11163 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
   11164 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   11165 ; SKX-NEXT:    retq # sched: [7:1.00]
   11166   %vec2 = load <4 x i64>, <4 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11167   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; indices 0-3 pick from %vec1, 4-7 from %vec2: result = {vec1[2], vec1[3], vec2[0], vec2[1]}
   11168   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11169   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3 ; %shuf where %cmp is true, %vec3 elsewhere
   11170   ret <4 x i64> %res
   11171 }
   11172 
   11173 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; zero-masked shuffle with the second operand loaded from %vec2p: zero-%mask lanes take the shuffle, the rest become 0
   11174 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
   11175 ; GENERIC:       # %bb.0:
   11176 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   11177 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
   11178 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11179 ;
   11180 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
   11181 ; SKX:       # %bb.0:
   11182 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   11183 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
   11184 ; SKX-NEXT:    retq # sched: [7:1.00]
   11185   %vec2 = load <4 x i64>, <4 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11186   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; indices 0-3 pick from %vec1, 4-7 from %vec2: result = {vec1[2], vec1[3], vec2[0], vec2[1]}
   11187   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11188   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; %shuf where %cmp is true, 0 elsewhere
   11189   ret <4 x i64> %res
   11190 }
   11191 
   11192 define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; merge-masked shuffle with the second operand loaded from %vec2p: zero-%mask lanes take the shuffle, the rest keep %vec3
   11193 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
   11194 ; GENERIC:       # %bb.0:
   11195 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   11196 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
   11197 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   11198 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11199 ;
   11200 ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2:
   11201 ; SKX:       # %bb.0:
   11202 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   11203 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
   11204 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   11205 ; SKX-NEXT:    retq # sched: [7:1.00]
   11206   %vec2 = load <4 x i64>, <4 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11207   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; result = {vec1[2], vec1[3], vec2[0], vec2[1]}; NOTE(review): same index list as test_4xi64_masked_shuff_mem_mask1 - confirm the overlap is intended
   11208   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11209   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3 ; %shuf where %cmp is true, %vec3 elsewhere
   11210   ret <4 x i64> %res
   11211 }
   11212 
   11213 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; zero-masked shuffle with the second operand loaded from %vec2p: zero-%mask lanes take the shuffle, the rest become 0
   11214 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
   11215 ; GENERIC:       # %bb.0:
   11216 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   11217 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
   11218 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11219 ;
   11220 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
   11221 ; SKX:       # %bb.0:
   11222 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   11223 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
   11224 ; SKX-NEXT:    retq # sched: [7:1.00]
   11225   %vec2 = load <4 x i64>, <4 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11226   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; result = {vec1[2], vec1[3], vec2[0], vec2[1]}; NOTE(review): same index list as test_4xi64_zero_masked_shuff_mem_mask1 - confirm the overlap is intended
   11227   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11228   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; %shuf where %cmp is true, 0 elsewhere
   11229   ret <4 x i64> %res
   11230 }
   11231 
   11232 define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; unmasked shuffle whose second operand is loaded from %vec2p; the expected asm below is a single vperm2i128 with a memory source
   11233 ; GENERIC-LABEL: test_4xi64_shuff_mem_mask3:
   11234 ; GENERIC:       # %bb.0:
   11235 ; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
   11236 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11237 ;
   11238 ; SKX-LABEL: test_4xi64_shuff_mem_mask3:
   11239 ; SKX:       # %bb.0:
   11240 ; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
   11241 ; SKX-NEXT:    retq # sched: [7:1.00]
   11242   %vec2 = load <4 x i64>, <4 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11243   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = {vec1[2], vec1[3], vec2[2], vec2[3]}; NOTE(review): same index list as test_4xi64_shuff_mem_mask0 - confirm the overlap is intended
   11244   ret <4 x i64> %res
   11245 }
   11246 define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; merge-masked shuffle with the second operand loaded from %vec2p: zero-%mask lanes take the shuffle, the rest keep %vec3
   11247 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
   11248 ; GENERIC:       # %bb.0:
   11249 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   11250 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
   11251 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
   11252 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11253 ;
   11254 ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3:
   11255 ; SKX:       # %bb.0:
   11256 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   11257 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
   11258 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
   11259 ; SKX-NEXT:    retq # sched: [7:1.00]
   11260   %vec2 = load <4 x i64>, <4 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11261   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = {vec1[2], vec1[3], vec2[2], vec2[3]}; NOTE(review): same index list as test_4xi64_masked_shuff_mem_mask0 - confirm the overlap is intended
   11262   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11263   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3 ; %shuf where %cmp is true, %vec3 elsewhere
   11264   ret <4 x i64> %res
   11265 }
   11266 
   11267 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; zero-masked shuffle with the second operand loaded from %vec2p: zero-%mask lanes take the shuffle, the rest become 0
   11268 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
   11269 ; GENERIC:       # %bb.0:
   11270 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   11271 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
   11272 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11273 ;
   11274 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
   11275 ; SKX:       # %bb.0:
   11276 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   11277 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
   11278 ; SKX-NEXT:    retq # sched: [7:1.00]
   11279   %vec2 = load <4 x i64>, <4 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11280   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; result = {vec1[2], vec1[3], vec2[2], vec2[3]}; NOTE(review): same index list as test_4xi64_zero_masked_shuff_mem_mask0 - confirm the overlap is intended
   11281   %cmp = icmp eq <4 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11282   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ; %shuf where %cmp is true, 0 elsewhere
   11283   ret <4 x i64> %res
   11284 }
   11285 
   11286 define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) { ; unmasked 8-element register shuffle; the expected asm below is a single vshufi64x2 on zmm registers
   11287 ; GENERIC-LABEL: test_8xi64_shuff_mask0:
   11288 ; GENERIC:       # %bb.0:
   11289 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
   11290 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11291 ;
   11292 ; SKX-LABEL: test_8xi64_shuff_mask0:
   11293 ; SKX:       # %bb.0:
   11294 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
   11295 ; SKX-NEXT:    retq # sched: [7:1.00]
   11296   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[4,5,4,5] followed by vec2[4,5,4,5]
   11297   ret <8 x i64> %res
   11298 }
   11299 define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { ; merge-masked register shuffle: lanes with a zero %mask element take the shuffle, the rest keep %vec3
   11300 ; GENERIC-LABEL: test_8xi64_masked_shuff_mask0:
   11301 ; GENERIC:       # %bb.0:
   11302 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   11303 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
   11304 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
   11305 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11306 ;
   11307 ; SKX-LABEL: test_8xi64_masked_shuff_mask0:
   11308 ; SKX:       # %bb.0:
   11309 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   11310 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
   11311 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
   11312 ; SKX-NEXT:    retq # sched: [7:1.00]
   11313   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[4,5,4,5] followed by vec2[4,5,4,5]
   11314   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11315   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3 ; %shuf where %cmp is true, %vec3 elsewhere
   11316   ret <8 x i64> %res
   11317 }
   11318 
   11319 define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { ; zero-masked register shuffle: lanes with a zero %mask element take the shuffle, the rest become 0
   11320 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask0:
   11321 ; GENERIC:       # %bb.0:
   11322 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   11323 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
   11324 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11325 ;
   11326 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mask0:
   11327 ; SKX:       # %bb.0:
   11328 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   11329 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
   11330 ; SKX-NEXT:    retq # sched: [7:1.00]
   11331   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[4,5,4,5] followed by vec2[4,5,4,5]
   11332   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11333   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ; %shuf where %cmp is true, 0 elsewhere
   11334   ret <8 x i64> %res
   11335 }
   11336 define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { ; merge-masked register shuffle: lanes with a zero %mask element take the shuffle, the rest keep %vec3
   11337 ; GENERIC-LABEL: test_8xi64_masked_shuff_mask1:
   11338 ; GENERIC:       # %bb.0:
   11339 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   11340 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
   11341 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
   11342 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11343 ;
   11344 ; SKX-LABEL: test_8xi64_masked_shuff_mask1:
   11345 ; SKX:       # %bb.0:
   11346 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   11347 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
   11348 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
   11349 ; SKX-NEXT:    retq # sched: [7:1.00]
   11350   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[6,7,4,5] followed by vec2[2,3,4,5]
   11351   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11352   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3 ; %shuf where %cmp is true, %vec3 elsewhere
   11353   ret <8 x i64> %res
   11354 }
   11355 
   11356 define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { ; zero-masked register shuffle: lanes with a zero %mask element take the shuffle, the rest become 0
   11357 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask1:
   11358 ; GENERIC:       # %bb.0:
   11359 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   11360 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
   11361 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11362 ;
   11363 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mask1:
   11364 ; SKX:       # %bb.0:
   11365 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   11366 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
   11367 ; SKX-NEXT:    retq # sched: [7:1.00]
   11368   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[6,7,4,5] followed by vec2[2,3,4,5]
   11369   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11370   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ; %shuf where %cmp is true, 0 elsewhere
   11371   ret <8 x i64> %res
   11372 }
   11373 define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { ; merge-masked register shuffle: lanes with a zero %mask element take the shuffle, the rest keep %vec3
   11374 ; GENERIC-LABEL: test_8xi64_masked_shuff_mask2:
   11375 ; GENERIC:       # %bb.0:
   11376 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   11377 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
   11378 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
   11379 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11380 ;
   11381 ; SKX-LABEL: test_8xi64_masked_shuff_mask2:
   11382 ; SKX:       # %bb.0:
   11383 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   11384 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
   11385 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
   11386 ; SKX-NEXT:    retq # sched: [7:1.00]
   11387   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[0,1,4,5] followed by vec2[0,1,0,1]
   11388   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11389   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3 ; %shuf where %cmp is true, %vec3 elsewhere
   11390   ret <8 x i64> %res
   11391 }
   11392 
   11393 define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { ; zero-masked register shuffle: lanes with a zero %mask element take the shuffle, the rest become 0
   11394 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask2:
   11395 ; GENERIC:       # %bb.0:
   11396 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   11397 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
   11398 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11399 ;
   11400 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mask2:
   11401 ; SKX:       # %bb.0:
   11402 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   11403 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
   11404 ; SKX-NEXT:    retq # sched: [7:1.00]
   11405   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[0,1,4,5] followed by vec2[0,1,0,1]
   11406   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11407   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ; %shuf where %cmp is true, 0 elsewhere
   11408   ret <8 x i64> %res
   11409 }
   11410 define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) { ; unmasked 8-element register shuffle; the expected asm below is a single vshufi64x2 on zmm registers
   11411 ; GENERIC-LABEL: test_8xi64_shuff_mask3:
   11412 ; GENERIC:       # %bb.0:
   11413 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
   11414 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11415 ;
   11416 ; SKX-LABEL: test_8xi64_shuff_mask3:
   11417 ; SKX:       # %bb.0:
   11418 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
   11419 ; SKX-NEXT:    retq # sched: [7:1.00]
   11420   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[2,3,6,7] followed by vec2[4,5,2,3]
   11421   ret <8 x i64> %res
   11422 }
   11423 define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { ; merge-masked register shuffle: lanes with a zero %mask element take the shuffle, the rest keep %vec3
   11424 ; GENERIC-LABEL: test_8xi64_masked_shuff_mask3:
   11425 ; GENERIC:       # %bb.0:
   11426 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   11427 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
   11428 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
   11429 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11430 ;
   11431 ; SKX-LABEL: test_8xi64_masked_shuff_mask3:
   11432 ; SKX:       # %bb.0:
   11433 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   11434 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
   11435 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
   11436 ; SKX-NEXT:    retq # sched: [7:1.00]
   11437   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[2,3,6,7] followed by vec2[4,5,2,3]
   11438   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11439   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3 ; %shuf where %cmp is true, %vec3 elsewhere
   11440   ret <8 x i64> %res
   11441 }
   11442 
   11443 define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { ; zero-masked register shuffle: lanes with a zero %mask element take the shuffle, the rest become 0
   11444 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask3:
   11445 ; GENERIC:       # %bb.0:
   11446 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   11447 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
   11448 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11449 ;
   11450 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mask3:
   11451 ; SKX:       # %bb.0:
   11452 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   11453 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
   11454 ; SKX-NEXT:    retq # sched: [7:1.00]
   11455   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[2,3,6,7] followed by vec2[4,5,2,3]
   11456   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11457   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ; %shuf where %cmp is true, 0 elsewhere
   11458   ret <8 x i64> %res
   11459 }
   11460 define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) { ; unmasked shuffle whose second operand is loaded from %vec2p; the expected asm below is a single vshufi64x2 with a memory source
   11461 ; GENERIC-LABEL: test_8xi64_shuff_mem_mask0:
   11462 ; GENERIC:       # %bb.0:
   11463 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
   11464 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11465 ;
   11466 ; SKX-LABEL: test_8xi64_shuff_mem_mask0:
   11467 ; SKX:       # %bb.0:
   11468 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
   11469 ; SKX-NEXT:    retq # sched: [7:1.00]
   11470   %vec2 = load <8 x i64>, <8 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11471   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[2,3,2,3] followed by vec2[4,5,2,3]
   11472   ret <8 x i64> %res
   11473 }
   11474 define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { ; merge-masked shuffle with the second operand loaded from %vec2p: zero-%mask lanes take the shuffle, the rest keep %vec3
   11475 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask0:
   11476 ; GENERIC:       # %bb.0:
   11477 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   11478 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
   11479 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   11480 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11481 ;
   11482 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask0:
   11483 ; SKX:       # %bb.0:
   11484 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   11485 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
   11486 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   11487 ; SKX-NEXT:    retq # sched: [7:1.00]
   11488   %vec2 = load <8 x i64>, <8 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11489   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[2,3,2,3] followed by vec2[4,5,2,3]
   11490   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11491   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3 ; %shuf where %cmp is true, %vec3 elsewhere
   11492   ret <8 x i64> %res
   11493 }
   11494 
   11495 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) { ; zero-masked shuffle with the second operand loaded from %vec2p: zero-%mask lanes take the shuffle, the rest become 0
   11496 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
   11497 ; GENERIC:       # %bb.0:
   11498 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   11499 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
   11500 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11501 ;
   11502 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
   11503 ; SKX:       # %bb.0:
   11504 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   11505 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
   11506 ; SKX-NEXT:    retq # sched: [7:1.00]
   11507   %vec2 = load <8 x i64>, <8 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11508   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[2,3,2,3] followed by vec2[4,5,2,3]
   11509   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11510   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ; %shuf where %cmp is true, 0 elsewhere
   11511   ret <8 x i64> %res
   11512 }
   11513 
   11514 define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { ; merge-masked shuffle with the second operand loaded from %vec2p: zero-%mask lanes take the shuffle, the rest keep %vec3
   11515 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask1:
   11516 ; GENERIC:       # %bb.0:
   11517 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   11518 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [8:1.00]
   11519 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   11520 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11521 ;
   11522 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask1:
   11523 ; SKX:       # %bb.0:
   11524 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   11525 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
   11526 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   11527 ; SKX-NEXT:    retq # sched: [7:1.00]
   11528   %vec2 = load <8 x i64>, <8 x i64>* %vec2p ; no separate load appears in the expected asm; the shuffle reads this operand as mem[...]
   11529   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9> ; indices 0-7 pick from %vec1, 8-15 from %vec2: result = vec1[2,3,0,1] followed by vec2[0,1,0,1]
   11530   %cmp = icmp eq <8 x i64> %mask, zeroinitializer ; i1 per lane, true where the %mask element equals 0
   11531   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3 ; %shuf where %cmp is true, %vec3 elsewhere
   11532   ret <8 x i64> %res
   11533 }
   11534 
   ; Zero-masked vshufi64x2 with the second source loaded from %vec2p: elements
   ; whose %mask entry is zero take the shuffle result, all others become zero.
   ; The checks pin the expected asm and sched annotations for both llc runs.
   11535 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
   11536 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
   11537 ; GENERIC:       # %bb.0:
   11538 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   11539 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [8:1.00]
   11540 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11541 ;
   11542 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
   11543 ; SKX:       # %bb.0:
   11544 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   11545 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
   11546 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-7 select from %vec1 and 8-15 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11547   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   11548   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   11549   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   11550   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   11551   ret <8 x i64> %res
   11552 }
   11553 
   ; Merge-masked vshufi64x2 with the second source loaded from %vec2p: elements
   ; whose %mask entry is zero take the shuffle result, all others keep %vec3.
   ; The checks pin the expected asm and sched annotations for both llc runs.
   11554 define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
   11555 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask2:
   11556 ; GENERIC:       # %bb.0:
   11557 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   11558 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [8:1.00]
   11559 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   11560 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11561 ;
   11562 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask2:
   11563 ; SKX:       # %bb.0:
   11564 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   11565 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
   11566 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   11567 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-7 select from %vec1 and 8-15 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11568   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   11569   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
   11570   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   11571   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   11572   ret <8 x i64> %res
   11573 }
   11574 
   ; Zero-masked vshufi64x2 with the second source loaded from %vec2p: elements
   ; whose %mask entry is zero take the shuffle result, all others become zero.
   ; The checks pin the expected asm and sched annotations for both llc runs.
   11575 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
   11576 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
   11577 ; GENERIC:       # %bb.0:
   11578 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   11579 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [8:1.00]
   11580 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11581 ;
   11582 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
   11583 ; SKX:       # %bb.0:
   11584 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   11585 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
   11586 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-7 select from %vec1 and 8-15 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11587   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   11588   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
   11589   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   11590   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   11591   ret <8 x i64> %res
   11592 }
   11593 
   ; Unmasked vshufi64x2 with the second source loaded from %vec2p. The checks
   ; pin the expected asm and sched annotations for both llc runs.
   11594 define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
   11595 ; GENERIC-LABEL: test_8xi64_shuff_mem_mask3:
   11596 ; GENERIC:       # %bb.0:
   11597 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
   11598 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11599 ;
   11600 ; SKX-LABEL: test_8xi64_shuff_mem_mask3:
   11601 ; SKX:       # %bb.0:
   11602 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
   11603 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-7 select from %vec1 and 8-15 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11604   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   11605   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
   11606   ret <8 x i64> %res
   11607 }
   ; Merge-masked vshufi64x2 with the second source loaded from %vec2p: elements
   ; whose %mask entry is zero take the shuffle result, all others keep %vec3.
   ; The checks pin the expected asm and sched annotations for both llc runs.
   11608 define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
   11609 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask3:
   11610 ; GENERIC:       # %bb.0:
   11611 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   11612 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
   11613 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
   11614 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11615 ;
   11616 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask3:
   11617 ; SKX:       # %bb.0:
   11618 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   11619 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
   11620 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
   11621 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-7 select from %vec1 and 8-15 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11622   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   11623   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
   11624   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   11625   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   11626   ret <8 x i64> %res
   11627 }
   11628 
   ; Zero-masked vshufi64x2 with the second source loaded from %vec2p: elements
   ; whose %mask entry is zero take the shuffle result, all others become zero.
   ; The checks pin the expected asm and sched annotations for both llc runs.
   11629 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
   11630 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
   11631 ; GENERIC:       # %bb.0:
   11632 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   11633 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
   11634 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11635 ;
   11636 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
   11637 ; SKX:       # %bb.0:
   11638 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   11639 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
   11640 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-7 select from %vec1 and 8-15 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11641   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   11642   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
   11643   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   11644   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   11645   ret <8 x i64> %res
   11646 }
   11647 
   ; Unmasked register-register vunpcklps on <4 x float>: interleaves elements 0
   ; and 1 of %vec1 with elements 0 and 1 of %vec2. The checks pin the expected
   ; asm and sched annotations for both llc runs.
   11648 define <4 x float> @test_4xfloat_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2) {
   11649 ; GENERIC-LABEL: test_4xfloat_unpack_low_mask0:
   11650 ; GENERIC:       # %bb.0:
   11651 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11652 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11653 ;
   11654 ; SKX-LABEL: test_4xfloat_unpack_low_mask0:
   11655 ; SKX:       # %bb.0:
   11656 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11657 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from %vec2.
   11658   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11659   ret <4 x float> %res
   11660 }
   ; Merge-masked register-register vunpcklps on <4 x float>: elements whose
   ; %mask entry is zero take the interleaved result, all others keep %vec3.
   ; The checks pin the expected asm and sched annotations for both llc runs.
   11661 define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
   11662 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask0:
   11663 ; GENERIC:       # %bb.0:
   11664 ; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
   11665 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11666 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
   11667 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11668 ;
   11669 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mask0:
   11670 ; SKX:       # %bb.0:
   11671 ; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
   11672 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11673 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
   11674 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from %vec2.
   11675   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11676   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11677   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   11678   ret <4 x float> %res
   11679 }
   11680 
   ; Zero-masked register-register vunpcklps on <4 x float>: elements whose
   ; %mask entry is zero take the interleaved result, all others become zero.
   ; The checks pin the expected asm and sched annotations for both llc runs.
   11681 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
   11682 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
   11683 ; GENERIC:       # %bb.0:
   11684 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   11685 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11686 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11687 ;
   11688 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
   11689 ; SKX:       # %bb.0:
   11690 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   11691 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11692 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from %vec2.
   11693   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11694   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11695   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   11696   ret <4 x float> %res
   11697 }
   ; Merge-masked register-register vunpcklps on <4 x float>: elements whose
   ; %mask entry is zero take the interleaved result, all others keep %vec3.
   ; NOTE(review): the IR body matches test_4xfloat_masked_unpack_low_mask0;
   ; presumably kept for naming parity with other mask variants - confirm.
   11698 define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
   11699 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask1:
   11700 ; GENERIC:       # %bb.0:
   11701 ; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
   11702 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11703 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
   11704 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11705 ;
   11706 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mask1:
   11707 ; SKX:       # %bb.0:
   11708 ; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
   11709 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11710 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
   11711 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from %vec2.
   11712   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11713   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11714   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   11715   ret <4 x float> %res
   11716 }
   11717 
   ; Zero-masked register-register vunpcklps on <4 x float>: elements whose
   ; %mask entry is zero take the interleaved result, all others become zero.
   ; NOTE(review): the IR body matches test_4xfloat_zero_masked_unpack_low_mask0;
   ; presumably kept for naming parity with other mask variants - confirm.
   11718 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
   11719 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
   11720 ; GENERIC:       # %bb.0:
   11721 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   11722 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11723 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11724 ;
   11725 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
   11726 ; SKX:       # %bb.0:
   11727 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   11728 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11729 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from %vec2.
   11730   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11731   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11732   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   11733   ret <4 x float> %res
   11734 }
   ; Merge-masked register-register vunpcklps on <4 x float>: elements whose
   ; %mask entry is zero take the interleaved result, all others keep %vec3.
   ; NOTE(review): the IR body matches test_4xfloat_masked_unpack_low_mask0;
   ; presumably kept for naming parity with other mask variants - confirm.
   11735 define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
   11736 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask2:
   11737 ; GENERIC:       # %bb.0:
   11738 ; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
   11739 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11740 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
   11741 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11742 ;
   11743 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mask2:
   11744 ; SKX:       # %bb.0:
   11745 ; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
   11746 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11747 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
   11748 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from %vec2.
   11749   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11750   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11751   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   11752   ret <4 x float> %res
   11753 }
   11754 
   ; Zero-masked register-register vunpcklps on <4 x float>: elements whose
   ; %mask entry is zero take the interleaved result, all others become zero.
   ; NOTE(review): the IR body matches test_4xfloat_zero_masked_unpack_low_mask0;
   ; presumably kept for naming parity with other mask variants - confirm.
   11755 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
   11756 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
   11757 ; GENERIC:       # %bb.0:
   11758 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   11759 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11760 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11761 ;
   11762 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
   11763 ; SKX:       # %bb.0:
   11764 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   11765 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11766 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from %vec2.
   11767   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11768   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11769   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   11770   ret <4 x float> %res
   11771 }
   ; Unmasked register-register vunpcklps on <4 x float>: interleaves elements 0
   ; and 1 of %vec1 with elements 0 and 1 of %vec2.
   ; NOTE(review): the IR body matches test_4xfloat_unpack_low_mask0; presumably
   ; kept for naming parity with other mask variants - confirm.
   11772 define <4 x float> @test_4xfloat_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2) {
   11773 ; GENERIC-LABEL: test_4xfloat_unpack_low_mask3:
   11774 ; GENERIC:       # %bb.0:
   11775 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11776 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11777 ;
   11778 ; SKX-LABEL: test_4xfloat_unpack_low_mask3:
   11779 ; SKX:       # %bb.0:
   11780 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11781 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from %vec2.
   11782   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11783   ret <4 x float> %res
   11784 }
   ; Merge-masked register-register vunpcklps on <4 x float>: elements whose
   ; %mask entry is zero take the interleaved result, all others keep %vec3.
   ; NOTE(review): the IR body matches test_4xfloat_masked_unpack_low_mask0;
   ; presumably kept for naming parity with other mask variants - confirm.
   11785 define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
   11786 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask3:
   11787 ; GENERIC:       # %bb.0:
   11788 ; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
   11789 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11790 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
   11791 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11792 ;
   11793 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mask3:
   11794 ; SKX:       # %bb.0:
   11795 ; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
   11796 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11797 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
   11798 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from %vec2.
   11799   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11800   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11801   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   11802   ret <4 x float> %res
   11803 }
   11804 
   ; Zero-masked register-register vunpcklps on <4 x float>: elements whose
   ; %mask entry is zero take the interleaved result, all others become zero.
   ; NOTE(review): the IR body matches test_4xfloat_zero_masked_unpack_low_mask0;
   ; presumably kept for naming parity with other mask variants - confirm.
   11805 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
   11806 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
   11807 ; GENERIC:       # %bb.0:
   11808 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   11809 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11810 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11811 ;
   11812 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
   11813 ; SKX:       # %bb.0:
   11814 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   11815 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
   11816 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from %vec2.
   11817   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11818   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11819   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   11820   ret <4 x float> %res
   11821 }
   ; Unmasked vunpcklps on <4 x float> with the second source loaded from
   ; %vec2p. The checks pin the expected asm and sched annotations for both
   ; llc runs.
   11822 define <4 x float> @test_4xfloat_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
   11823 ; GENERIC-LABEL: test_4xfloat_unpack_low_mem_mask0:
   11824 ; GENERIC:       # %bb.0:
   11825 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11826 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11827 ;
   11828 ; SKX-LABEL: test_4xfloat_unpack_low_mem_mask0:
   11829 ; SKX:       # %bb.0:
   11830 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11831 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11832   %vec2 = load <4 x float>, <4 x float>* %vec2p
   11833   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11834   ret <4 x float> %res
   11835 }
   ; Merge-masked vunpcklps on <4 x float> with the second source loaded from
   ; %vec2p: elements whose %mask entry is zero take the interleaved result,
   ; all others keep %vec3. The checks pin the expected asm and sched
   ; annotations for both llc runs.
   11836 define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
   11837 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
   11838 ; GENERIC:       # %bb.0:
   11839 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   11840 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11841 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
   11842 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11843 ;
   11844 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
   11845 ; SKX:       # %bb.0:
   11846 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   11847 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11848 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
   11849 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11850   %vec2 = load <4 x float>, <4 x float>* %vec2p
   11851   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11852   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11853   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   11854   ret <4 x float> %res
   11855 }
   11856 
   ; Zero-masked vunpcklps on <4 x float> with the second source loaded from
   ; %vec2p: elements whose %mask entry is zero take the interleaved result,
   ; all others become zero. The checks pin the expected asm and sched
   ; annotations for both llc runs.
   11857 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
   11858 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
   11859 ; GENERIC:       # %bb.0:
   11860 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   11861 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11862 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11863 ;
   11864 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
   11865 ; SKX:       # %bb.0:
   11866 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   11867 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11868 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11869   %vec2 = load <4 x float>, <4 x float>* %vec2p
   11870   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11871   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11872   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   11873   ret <4 x float> %res
   11874 }
   11875 
   ; Merge-masked vunpcklps on <4 x float> with the second source loaded from
   ; %vec2p: elements whose %mask entry is zero take the interleaved result,
   ; all others keep %vec3.
   ; NOTE(review): the IR body matches test_4xfloat_masked_unpack_low_mem_mask0;
   ; presumably kept for naming parity with other mask variants - confirm.
   11876 define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
   11877 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
   11878 ; GENERIC:       # %bb.0:
   11879 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   11880 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11881 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
   11882 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11883 ;
   11884 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
   11885 ; SKX:       # %bb.0:
   11886 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   11887 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11888 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
   11889 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11890   %vec2 = load <4 x float>, <4 x float>* %vec2p
   11891   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11892   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11893   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   11894   ret <4 x float> %res
   11895 }
   11896 
   ; Zero-masked vunpcklps on <4 x float> with the second source loaded from
   ; %vec2p: elements whose %mask entry is zero take the interleaved result,
   ; all others become zero.
   ; NOTE(review): the IR body matches
   ; test_4xfloat_zero_masked_unpack_low_mem_mask0; presumably kept for naming
   ; parity with other mask variants - confirm.
   11897 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
   11898 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
   11899 ; GENERIC:       # %bb.0:
   11900 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   11901 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11902 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11903 ;
   11904 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
   11905 ; SKX:       # %bb.0:
   11906 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   11907 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11908 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11909   %vec2 = load <4 x float>, <4 x float>* %vec2p
   11910   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11911   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11912   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   11913   ret <4 x float> %res
   11914 }
   11915 
   ; Merge-masked vunpcklps on <4 x float> with the second source loaded from
   ; %vec2p: elements whose %mask entry is zero take the interleaved result,
   ; all others keep %vec3.
   ; NOTE(review): the IR body matches test_4xfloat_masked_unpack_low_mem_mask0;
   ; presumably kept for naming parity with other mask variants - confirm.
   11916 define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
   11917 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
   11918 ; GENERIC:       # %bb.0:
   11919 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   11920 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11921 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
   11922 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11923 ;
   11924 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
   11925 ; SKX:       # %bb.0:
   11926 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   11927 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11928 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
   11929 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11930   %vec2 = load <4 x float>, <4 x float>* %vec2p
   11931   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11932   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11933   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   11934   ret <4 x float> %res
   11935 }
   11936 
   ; Zero-masked vunpcklps on <4 x float> with the second source loaded from
   ; %vec2p: elements whose %mask entry is zero take the interleaved result,
   ; all others become zero.
   ; NOTE(review): the IR body matches
   ; test_4xfloat_zero_masked_unpack_low_mem_mask0; presumably kept for naming
   ; parity with other mask variants - confirm.
   11937 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
   11938 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
   11939 ; GENERIC:       # %bb.0:
   11940 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   11941 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11942 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11943 ;
   11944 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
   11945 ; SKX:       # %bb.0:
   11946 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   11947 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11948 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11949   %vec2 = load <4 x float>, <4 x float>* %vec2p
   11950   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11951   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11952   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   11953   ret <4 x float> %res
   11954 }
   11955 
   ; Unmasked vunpcklps on <4 x float> with the second source loaded from
   ; %vec2p.
   ; NOTE(review): the IR body matches test_4xfloat_unpack_low_mem_mask0;
   ; presumably kept for naming parity with other mask variants - confirm.
   11956 define <4 x float> @test_4xfloat_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
   11957 ; GENERIC-LABEL: test_4xfloat_unpack_low_mem_mask3:
   11958 ; GENERIC:       # %bb.0:
   11959 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11960 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11961 ;
   11962 ; SKX-LABEL: test_4xfloat_unpack_low_mem_mask3:
   11963 ; SKX:       # %bb.0:
   11964 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11965 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11966   %vec2 = load <4 x float>, <4 x float>* %vec2p
   11967   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11968   ret <4 x float> %res
   11969 }
   ; Merge-masked vunpcklps on <4 x float> with the second source loaded from
   ; %vec2p: elements whose %mask entry is zero take the interleaved result,
   ; all others keep %vec3.
   ; NOTE(review): the IR body matches test_4xfloat_masked_unpack_low_mem_mask0;
   ; presumably kept for naming parity with other mask variants - confirm.
   11970 define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
   11971 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
   11972 ; GENERIC:       # %bb.0:
   11973 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   11974 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11975 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
   11976 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11977 ;
   11978 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
   11979 ; SKX:       # %bb.0:
   11980 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   11981 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11982 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
   11983 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   11984   %vec2 = load <4 x float>, <4 x float>* %vec2p
   11985   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   11986   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   11987   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   11988   ret <4 x float> %res
   11989 }
   11990 
   ; Zero-masked vunpcklps on <4 x float> with the second source loaded from
   ; %vec2p: elements whose %mask entry is zero take the interleaved result,
   ; all others become zero.
   ; NOTE(review): the IR body matches
   ; test_4xfloat_zero_masked_unpack_low_mem_mask0; presumably kept for naming
   ; parity with other mask variants - confirm.
   11991 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
   11992 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
   11993 ; GENERIC:       # %bb.0:
   11994 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   11995 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   11996 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   11997 ;
   11998 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
   11999 ; SKX:       # %bb.0:
   12000 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   12001 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
   12002 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-3 select from %vec1 and 4-7 from the loaded %vec2
   ; (shown as the mem[...] operand in the checks above).
   12003   %vec2 = load <4 x float>, <4 x float>* %vec2p
   12004   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   12005   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   12006   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   12007   ret <4 x float> %res
   12008 }
   12009 
   ; Unmasked register-register vunpcklps on <8 x float>: interleaves elements
   ; 0, 1, 4 and 5 of %vec1 with the same-numbered elements of %vec2. The checks
   ; pin the expected asm and sched annotations for both llc runs.
   12010 define <8 x float> @test_8xfloat_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2) {
   12011 ; GENERIC-LABEL: test_8xfloat_unpack_low_mask0:
   12012 ; GENERIC:       # %bb.0:
   12013 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12014 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12015 ;
   12016 ; SKX-LABEL: test_8xfloat_unpack_low_mask0:
   12017 ; SKX:       # %bb.0:
   12018 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12019 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-7 select from %vec1 and 8-15 from %vec2.
   12020   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12021   ret <8 x float> %res
   12022 }
   ; Merge-masked register-register vunpcklps on <8 x float>: elements whose
   ; %mask entry is zero take the interleaved result, all others keep %vec3.
   ; The checks pin the expected asm and sched annotations for both llc runs.
   12023 define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
   12024 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask0:
   12025 ; GENERIC:       # %bb.0:
   12026 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   12027 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12028 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
   12029 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12030 ;
   12031 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mask0:
   12032 ; SKX:       # %bb.0:
   12033 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   12034 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12035 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
   12036 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-7 select from %vec1 and 8-15 from %vec2.
   12037   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12038   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12039   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   12040   ret <8 x float> %res
   12041 }
   12042 
   ; Zero-masked register-register vunpcklps on <8 x float>: elements whose
   ; %mask entry is zero take the interleaved result, all others become zero.
   ; The checks pin the expected asm and sched annotations for both llc runs.
   12043 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
   12044 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
   12045 ; GENERIC:       # %bb.0:
   12046 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   12047 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12048 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12049 ;
   12050 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
   12051 ; SKX:       # %bb.0:
   12052 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   12053 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12054 ; SKX-NEXT:    retq # sched: [7:1.00]
   ; Shuffle indices 0-7 select from %vec1 and 8-15 from %vec2.
   12055   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12056   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12057   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   12058   ret <8 x float> %res
   12059 }
   12060 define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
   12061 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask1:
   12062 ; GENERIC:       # %bb.0:
   12063 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   12064 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12065 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
   12066 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12067 ;
   12068 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mask1:
   12069 ; SKX:       # %bb.0:
   12070 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   12071 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12072 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
   12073 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked unpack-low: interleave elements 0-1 of each 128-bit (4 x float)
         ; lane of %vec1 and %vec2; result lanes whose %mask element is zero take the
         ; shuffle, all other lanes keep %vec3.
         ; Shuffle indices are the same as in the _mask0 variant of this test.
   12074   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12075   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12076   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   12077   ret <8 x float> %res
   12078 }
   12079 
   12080 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
   12081 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
   12082 ; GENERIC:       # %bb.0:
   12083 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   12084 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12085 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12086 ;
   12087 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
   12088 ; SKX:       # %bb.0:
   12089 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   12090 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12091 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked unpack-low: interleave elements 0-1 of each 128-bit (4 x float)
         ; lane of %vec1 and %vec2; result lanes whose %mask element is zero take the
         ; shuffle, all other lanes are zeroed.
         ; Shuffle indices are the same as in the _mask0 variant of this test.
   12092   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12093   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12094   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   12095   ret <8 x float> %res
   12096 }
   12097 define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
   12098 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask2:
   12099 ; GENERIC:       # %bb.0:
   12100 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   12101 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12102 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
   12103 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12104 ;
   12105 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mask2:
   12106 ; SKX:       # %bb.0:
   12107 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   12108 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12109 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
   12110 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked unpack-low: interleave elements 0-1 of each 128-bit (4 x float)
         ; lane of %vec1 and %vec2; result lanes whose %mask element is zero take the
         ; shuffle, all other lanes keep %vec3.
         ; Shuffle indices are the same as in the _mask0 variant of this test.
   12111   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12112   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12113   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   12114   ret <8 x float> %res
   12115 }
   12116 
   12117 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
   12118 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
   12119 ; GENERIC:       # %bb.0:
   12120 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   12121 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12122 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12123 ;
   12124 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
   12125 ; SKX:       # %bb.0:
   12126 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   12127 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12128 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked unpack-low: interleave elements 0-1 of each 128-bit (4 x float)
         ; lane of %vec1 and %vec2; result lanes whose %mask element is zero take the
         ; shuffle, all other lanes are zeroed.
         ; Shuffle indices are the same as in the _mask0 variant of this test.
   12129   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12130   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12131   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   12132   ret <8 x float> %res
   12133 }
   12134 define <8 x float> @test_8xfloat_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2) {
   12135 ; GENERIC-LABEL: test_8xfloat_unpack_low_mask3:
   12136 ; GENERIC:       # %bb.0:
   12137 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12138 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12139 ;
   12140 ; SKX-LABEL: test_8xfloat_unpack_low_mask3:
   12141 ; SKX:       # %bb.0:
   12142 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12143 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Unmasked unpack-low: interleave elements 0-1 of each 128-bit (4 x float)
         ; lane of %vec1 and %vec2 and return the shuffle directly.
   12144   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12145   ret <8 x float> %res
   12146 }
   12147 define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
   12148 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask3:
   12149 ; GENERIC:       # %bb.0:
   12150 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   12151 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12152 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
   12153 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12154 ;
   12155 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mask3:
   12156 ; SKX:       # %bb.0:
   12157 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   12158 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12159 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
   12160 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked unpack-low: interleave elements 0-1 of each 128-bit (4 x float)
         ; lane of %vec1 and %vec2; result lanes whose %mask element is zero take the
         ; shuffle, all other lanes keep %vec3.
         ; Shuffle indices are the same as in the _mask0 variant of this test.
   12161   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12162   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12163   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   12164   ret <8 x float> %res
   12165 }
   12166 
   12167 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
   12168 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
   12169 ; GENERIC:       # %bb.0:
   12170 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   12171 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12172 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12173 ;
   12174 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
   12175 ; SKX:       # %bb.0:
   12176 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   12177 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   12178 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked unpack-low: interleave elements 0-1 of each 128-bit (4 x float)
         ; lane of %vec1 and %vec2; result lanes whose %mask element is zero take the
         ; shuffle, all other lanes are zeroed.
         ; Shuffle indices are the same as in the _mask0 variant of this test.
   12179   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12180   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12181   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   12182   ret <8 x float> %res
   12183 }
   12184 define <8 x float> @test_8xfloat_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
   12185 ; GENERIC-LABEL: test_8xfloat_unpack_low_mem_mask0:
   12186 ; GENERIC:       # %bb.0:
   12187 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12188 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12189 ;
   12190 ; SKX-LABEL: test_8xfloat_unpack_low_mem_mask0:
   12191 ; SKX:       # %bb.0:
   12192 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12193 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Unmasked unpack-low with the second source loaded from %vec2p (the expected
         ; assembly uses it as a memory operand): interleave elements 0-1 of each
         ; 128-bit (4 x float) lane of %vec1 and the loaded vector.
   12194   %vec2 = load <8 x float>, <8 x float>* %vec2p
   12195   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12196   ret <8 x float> %res
   12197 }
   12198 define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
   12199 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
   12200 ; GENERIC:       # %bb.0:
   12201 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   12202 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12203 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   12204 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12205 ;
   12206 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
   12207 ; SKX:       # %bb.0:
   12208 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   12209 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12210 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   12211 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked unpack-low with the second source loaded from %vec2p (the
         ; expected assembly uses it as a memory operand): result lanes whose %mask
         ; element is zero take the shuffle, all other lanes keep %vec3.
   12212   %vec2 = load <8 x float>, <8 x float>* %vec2p
   12213   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12214   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12215   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   12216   ret <8 x float> %res
   12217 }
   12218 
   12219 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
   12220 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
   12221 ; GENERIC:       # %bb.0:
   12222 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   12223 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12224 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12225 ;
   12226 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
   12227 ; SKX:       # %bb.0:
   12228 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   12229 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12230 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked unpack-low with the second source loaded from %vec2p (the
         ; expected assembly uses it as a memory operand): result lanes whose %mask
         ; element is zero take the shuffle, all other lanes are zeroed.
   12231   %vec2 = load <8 x float>, <8 x float>* %vec2p
   12232   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12233   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12234   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   12235   ret <8 x float> %res
   12236 }
   12237 
   12238 define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
   12239 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
   12240 ; GENERIC:       # %bb.0:
   12241 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   12242 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12243 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   12244 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12245 ;
   12246 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
   12247 ; SKX:       # %bb.0:
   12248 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   12249 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12250 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   12251 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked unpack-low with the second source loaded from %vec2p (the
         ; expected assembly uses it as a memory operand): result lanes whose %mask
         ; element is zero take the shuffle, all other lanes keep %vec3.
         ; Shuffle indices are the same as in the _mem_mask0 variant of this test.
   12252   %vec2 = load <8 x float>, <8 x float>* %vec2p
   12253   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12254   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12255   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   12256   ret <8 x float> %res
   12257 }
   12258 
   12259 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
   12260 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
   12261 ; GENERIC:       # %bb.0:
   12262 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   12263 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12264 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12265 ;
   12266 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
   12267 ; SKX:       # %bb.0:
   12268 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   12269 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12270 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked unpack-low with the second source loaded from %vec2p (the
         ; expected assembly uses it as a memory operand): result lanes whose %mask
         ; element is zero take the shuffle, all other lanes are zeroed.
         ; Shuffle indices are the same as in the _mem_mask0 variant of this test.
   12271   %vec2 = load <8 x float>, <8 x float>* %vec2p
   12272   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12273   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12274   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   12275   ret <8 x float> %res
   12276 }
   12277 
   12278 define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
   12279 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
   12280 ; GENERIC:       # %bb.0:
   12281 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   12282 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12283 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   12284 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12285 ;
   12286 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
   12287 ; SKX:       # %bb.0:
   12288 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   12289 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12290 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   12291 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked unpack-low with the second source loaded from %vec2p (the
         ; expected assembly uses it as a memory operand): result lanes whose %mask
         ; element is zero take the shuffle, all other lanes keep %vec3.
         ; Shuffle indices are the same as in the _mem_mask0 variant of this test.
   12292   %vec2 = load <8 x float>, <8 x float>* %vec2p
   12293   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12294   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12295   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   12296   ret <8 x float> %res
   12297 }
   12298 
   12299 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
   12300 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
   12301 ; GENERIC:       # %bb.0:
   12302 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   12303 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12304 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12305 ;
   12306 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
   12307 ; SKX:       # %bb.0:
   12308 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   12309 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12310 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked unpack-low with the second source loaded from %vec2p (the
         ; expected assembly uses it as a memory operand): result lanes whose %mask
         ; element is zero take the shuffle, all other lanes are zeroed.
         ; Shuffle indices are the same as in the _mem_mask0 variant of this test.
   12311   %vec2 = load <8 x float>, <8 x float>* %vec2p
   12312   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12313   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12314   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   12315   ret <8 x float> %res
   12316 }
   12317 
   12318 define <8 x float> @test_8xfloat_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
   12319 ; GENERIC-LABEL: test_8xfloat_unpack_low_mem_mask3:
   12320 ; GENERIC:       # %bb.0:
   12321 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12322 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12323 ;
   12324 ; SKX-LABEL: test_8xfloat_unpack_low_mem_mask3:
   12325 ; SKX:       # %bb.0:
   12326 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12327 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Unmasked unpack-low with the second source loaded from %vec2p (the expected
         ; assembly uses it as a memory operand): interleave elements 0-1 of each
         ; 128-bit (4 x float) lane of %vec1 and the loaded vector.
         ; Shuffle indices are the same as in the _mem_mask0 variant of this test.
   12328   %vec2 = load <8 x float>, <8 x float>* %vec2p
   12329   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12330   ret <8 x float> %res
   12331 }
   12332 define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
   12333 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
   12334 ; GENERIC:       # %bb.0:
   12335 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   12336 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12337 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   12338 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12339 ;
   12340 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
   12341 ; SKX:       # %bb.0:
   12342 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   12343 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12344 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   12345 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked unpack-low with the second source loaded from %vec2p (the
         ; expected assembly uses it as a memory operand): result lanes whose %mask
         ; element is zero take the shuffle, all other lanes keep %vec3.
         ; Shuffle indices are the same as in the _mem_mask0 variant of this test.
   12346   %vec2 = load <8 x float>, <8 x float>* %vec2p
   12347   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12348   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12349   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   12350   ret <8 x float> %res
   12351 }
   12352 
   12353 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
   12354 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
   12355 ; GENERIC:       # %bb.0:
   12356 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   12357 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12358 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12359 ;
   12360 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
   12361 ; SKX:       # %bb.0:
   12362 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   12363 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   12364 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked unpack-low with the second source loaded from %vec2p (the
         ; expected assembly uses it as a memory operand): result lanes whose %mask
         ; element is zero take the shuffle, all other lanes are zeroed.
         ; Shuffle indices are the same as in the _mem_mask0 variant of this test.
   12365   %vec2 = load <8 x float>, <8 x float>* %vec2p
   12366   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   12367   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   12368   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   12369   ret <8 x float> %res
   12370 }
   12371 
   12372 define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2) {
   12373 ; GENERIC-LABEL: test_16xfloat_unpack_low_mask0:
   12374 ; GENERIC:       # %bb.0:
   12375 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12376 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12377 ;
   12378 ; SKX-LABEL: test_16xfloat_unpack_low_mask0:
   12379 ; SKX:       # %bb.0:
   12380 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12381 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Unmasked 512-bit unpack-low: interleave elements 0-1 of each 128-bit
         ; (4 x float) lane of %vec1 and %vec2 and return the shuffle directly.
   12382   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12383   ret <16 x float> %res
   12384 }
   12385 define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
   12386 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask0:
   12387 ; GENERIC:       # %bb.0:
   12388 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   12389 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12390 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
   12391 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12392 ;
   12393 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask0:
   12394 ; SKX:       # %bb.0:
   12395 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   12396 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12397 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
   12398 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked 512-bit unpack-low: interleave elements 0-1 of each 128-bit
         ; (4 x float) lane of %vec1 and %vec2; result lanes whose %mask element is
         ; zero take the shuffle, all other lanes keep %vec3.
   12399   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12400   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12401   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   12402   ret <16 x float> %res
   12403 }
   12404 
   12405 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
   12406 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
   12407 ; GENERIC:       # %bb.0:
   12408 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   12409 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12410 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12411 ;
   12412 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
   12413 ; SKX:       # %bb.0:
   12414 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   12415 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12416 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked 512-bit unpack-low: interleave elements 0-1 of each 128-bit
         ; (4 x float) lane of %vec1 and %vec2; result lanes whose %mask element is
         ; zero take the shuffle, all other lanes are zeroed.
   12417   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12418   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12419   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   12420   ret <16 x float> %res
   12421 }
   12422 define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
   12423 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask1:
   12424 ; GENERIC:       # %bb.0:
   12425 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   12426 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12427 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
   12428 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12429 ;
   12430 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask1:
   12431 ; SKX:       # %bb.0:
   12432 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   12433 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12434 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
   12435 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked 512-bit unpack-low: interleave elements 0-1 of each 128-bit
         ; (4 x float) lane of %vec1 and %vec2; result lanes whose %mask element is
         ; zero take the shuffle, all other lanes keep %vec3.
         ; Shuffle indices are the same as in the _mask0 variant of this test.
   12436   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12437   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12438   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   12439   ret <16 x float> %res
   12440 }
   12441 
   12442 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
   12443 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
   12444 ; GENERIC:       # %bb.0:
   12445 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   12446 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12447 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12448 ;
   12449 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
   12450 ; SKX:       # %bb.0:
   12451 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   12452 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12453 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked 512-bit unpack-low: interleave elements 0-1 of each 128-bit
         ; (4 x float) lane of %vec1 and %vec2; result lanes whose %mask element is
         ; zero take the shuffle, all other lanes are zeroed.
         ; Shuffle indices are the same as in the _mask0 variant of this test.
   12454   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12455   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12456   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   12457   ret <16 x float> %res
   12458 }
   12459 define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
   12460 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask2:
   12461 ; GENERIC:       # %bb.0:
   12462 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   12463 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12464 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
   12465 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12466 ;
   12467 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask2:
   12468 ; SKX:       # %bb.0:
   12469 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   12470 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12471 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
   12472 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked 512-bit unpack-low: interleave elements 0-1 of each 128-bit
         ; (4 x float) lane of %vec1 and %vec2; result lanes whose %mask element is
         ; zero take the shuffle, all other lanes keep %vec3.
         ; Shuffle indices are the same as in the _mask0 variant of this test.
   12473   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12474   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12475   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   12476   ret <16 x float> %res
   12477 }
   12478 
   12479 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
   12480 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
   12481 ; GENERIC:       # %bb.0:
   12482 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   12483 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12484 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12485 ;
   12486 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
   12487 ; SKX:       # %bb.0:
   12488 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   12489 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12490 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked 512-bit unpack-low: interleave elements 0-1 of each 128-bit
         ; (4 x float) lane of %vec1 and %vec2; result lanes whose %mask element is
         ; zero take the shuffle, all other lanes are zeroed.
         ; Shuffle indices are the same as in the _mask0 variant of this test.
   12491   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12492   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12493   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   12494   ret <16 x float> %res
   12495 }
   12496 define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2) {
   12497 ; GENERIC-LABEL: test_16xfloat_unpack_low_mask3:
   12498 ; GENERIC:       # %bb.0:
   12499 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12500 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12501 ;
   12502 ; SKX-LABEL: test_16xfloat_unpack_low_mask3:
   12503 ; SKX:       # %bb.0:
   12504 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12505 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Unmasked 512-bit unpack-low: interleave elements 0-1 of each 128-bit
         ; (4 x float) lane of %vec1 and %vec2 and return the shuffle directly.
         ; Shuffle indices are the same as in the _mask0 variant of this test.
   12506   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12507   ret <16 x float> %res
   12508 }
   12509 define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
         ; Merge-masked register form: lanes whose %mask element equals zero take the
         ; interleave of %vec1/%vec2, the remaining lanes keep %vec3 (icmp eq + select).
         ; Checks expect vptestnmd to produce %k1, vunpcklps into zmm2 under {%k1},
         ; and a vmovaps copy of zmm2 to zmm0 before retq.
   12510 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask3:
   12511 ; GENERIC:       # %bb.0:
   12512 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   12513 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12514 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
   12515 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12516 ;
   12517 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask3:
   12518 ; SKX:       # %bb.0:
   12519 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   12520 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12521 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
   12522 ; SKX-NEXT:    retq # sched: [7:1.00]
   12523   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12524   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12525   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   12526   ret <16 x float> %res
   12527 }
   12528 
   12529 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
         ; Zero-masked register form: lanes whose %mask element equals zero take the
         ; interleave of %vec1/%vec2, the remaining lanes are zeroinitializer.
         ; Checks expect vptestnmd to produce %k1 and vunpcklps with {%k1} {z}
         ; writing zmm0 directly (no extra move).
   12530 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
   12531 ; GENERIC:       # %bb.0:
   12532 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   12533 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12534 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12535 ;
   12536 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
   12537 ; SKX:       # %bb.0:
   12538 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   12539 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
   12540 ; SKX-NEXT:    retq # sched: [7:1.00]
   12541   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12542   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12543   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   12544   ret <16 x float> %res
   12545 }
   12546 define <16 x float> @test_16xfloat_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
         ; Unmasked memory form: %vec2 is loaded from %vec2p and interleaved with
         ; %vec1 (indices 0,16,1,17,4,20,...). Checks expect the load to show up as
         ; the `mem` operand of a single zmm vunpcklps.
   12547 ; GENERIC-LABEL: test_16xfloat_unpack_low_mem_mask0:
   12548 ; GENERIC:       # %bb.0:
   12549 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12550 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12551 ;
   12552 ; SKX-LABEL: test_16xfloat_unpack_low_mem_mask0:
   12553 ; SKX:       # %bb.0:
   12554 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12555 ; SKX-NEXT:    retq # sched: [7:1.00]
   12556   %vec2 = load <16 x float>, <16 x float>* %vec2p
   12557   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12558   ret <16 x float> %res
   12559 }
   12560 define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
         ; Merge-masked memory form: %vec2 is loaded from %vec2p; lanes whose %mask
         ; element equals zero take the interleave of %vec1/%vec2, others keep %vec3.
         ; Checks expect vptestnmd to produce %k1, vunpcklps with a `mem` operand into
         ; zmm1 under {%k1}, and a vmovaps copy of zmm1 to zmm0.
   12561 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
   12562 ; GENERIC:       # %bb.0:
   12563 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   12564 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12565 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   12566 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12567 ;
   12568 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
   12569 ; SKX:       # %bb.0:
   12570 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   12571 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12572 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   12573 ; SKX-NEXT:    retq # sched: [7:1.00]
   12574   %vec2 = load <16 x float>, <16 x float>* %vec2p
   12575   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12576   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12577   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   12578   ret <16 x float> %res
   12579 }
   12580 
   12581 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
         ; Zero-masked memory form: %vec2 is loaded from %vec2p; lanes whose %mask
         ; element equals zero take the interleave of %vec1/%vec2, others are zero.
         ; Checks expect vptestnmd to produce %k1 and vunpcklps with a `mem` operand
         ; and {%k1} {z} writing zmm0 directly.
   12582 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
   12583 ; GENERIC:       # %bb.0:
   12584 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   12585 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12586 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12587 ;
   12588 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
   12589 ; SKX:       # %bb.0:
   12590 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   12591 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12592 ; SKX-NEXT:    retq # sched: [7:1.00]
   12593   %vec2 = load <16 x float>, <16 x float>* %vec2p
   12594   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12595   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12596   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   12597   ret <16 x float> %res
   12598 }
   12599 
   12600 define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
         ; Merge-masked memory form: %vec2 is loaded from %vec2p; lanes whose %mask
         ; element equals zero take the interleave of %vec1/%vec2, others keep %vec3.
         ; Checks expect vptestnmd -> %k1, a {%k1}-masked vunpcklps with a `mem`
         ; operand into zmm1, then vmovaps zmm1 -> zmm0.
         ; NOTE(review): IR and expectations match the _mem_mask0 variant; only the
         ; function name differs.
   12601 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
   12602 ; GENERIC:       # %bb.0:
   12603 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   12604 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12605 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   12606 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12607 ;
   12608 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
   12609 ; SKX:       # %bb.0:
   12610 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   12611 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12612 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   12613 ; SKX-NEXT:    retq # sched: [7:1.00]
   12614   %vec2 = load <16 x float>, <16 x float>* %vec2p
   12615   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12616   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12617   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   12618   ret <16 x float> %res
   12619 }
   12620 
   12621 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
         ; Zero-masked memory form: %vec2 is loaded from %vec2p; lanes whose %mask
         ; element equals zero take the interleave of %vec1/%vec2, others are zero.
         ; Checks expect vptestnmd -> %k1 and vunpcklps with a `mem` operand and
         ; {%k1} {z} writing zmm0 directly.
         ; NOTE(review): IR and expectations match the _mem_mask0 variant; only the
         ; function name differs.
   12622 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
   12623 ; GENERIC:       # %bb.0:
   12624 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   12625 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12626 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12627 ;
   12628 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
   12629 ; SKX:       # %bb.0:
   12630 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   12631 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12632 ; SKX-NEXT:    retq # sched: [7:1.00]
   12633   %vec2 = load <16 x float>, <16 x float>* %vec2p
   12634   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12635   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12636   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   12637   ret <16 x float> %res
   12638 }
   12639 
   12640 define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
         ; Merge-masked memory form: %vec2 is loaded from %vec2p; lanes whose %mask
         ; element equals zero take the interleave of %vec1/%vec2, others keep %vec3.
         ; Checks expect vptestnmd -> %k1, a {%k1}-masked vunpcklps with a `mem`
         ; operand into zmm1, then vmovaps zmm1 -> zmm0.
         ; NOTE(review): IR and expectations match the _mem_mask0 variant; only the
         ; function name differs.
   12641 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
   12642 ; GENERIC:       # %bb.0:
   12643 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   12644 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12645 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   12646 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12647 ;
   12648 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
   12649 ; SKX:       # %bb.0:
   12650 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   12651 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12652 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   12653 ; SKX-NEXT:    retq # sched: [7:1.00]
   12654   %vec2 = load <16 x float>, <16 x float>* %vec2p
   12655   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12656   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12657   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   12658   ret <16 x float> %res
   12659 }
   12660 
   12661 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
         ; Zero-masked memory form: %vec2 is loaded from %vec2p; lanes whose %mask
         ; element equals zero take the interleave of %vec1/%vec2, others are zero.
         ; Checks expect vptestnmd -> %k1 and vunpcklps with a `mem` operand and
         ; {%k1} {z} writing zmm0 directly.
         ; NOTE(review): IR and expectations match the _mem_mask0 variant; only the
         ; function name differs.
   12662 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
   12663 ; GENERIC:       # %bb.0:
   12664 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   12665 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12666 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12667 ;
   12668 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
   12669 ; SKX:       # %bb.0:
   12670 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   12671 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12672 ; SKX-NEXT:    retq # sched: [7:1.00]
   12673   %vec2 = load <16 x float>, <16 x float>* %vec2p
   12674   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12675   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12676   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   12677   ret <16 x float> %res
   12678 }
   12679 
   12680 define <16 x float> @test_16xfloat_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
         ; Unmasked memory form: %vec2 is loaded from %vec2p and interleaved with
         ; %vec1 (indices 0,16,1,17,4,20,...). Checks expect the load to show up as
         ; the `mem` operand of a single zmm vunpcklps.
         ; NOTE(review): IR and expectations match the _mem_mask0 variant; only the
         ; function name differs.
   12681 ; GENERIC-LABEL: test_16xfloat_unpack_low_mem_mask3:
   12682 ; GENERIC:       # %bb.0:
   12683 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12684 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12685 ;
   12686 ; SKX-LABEL: test_16xfloat_unpack_low_mem_mask3:
   12687 ; SKX:       # %bb.0:
   12688 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12689 ; SKX-NEXT:    retq # sched: [7:1.00]
   12690   %vec2 = load <16 x float>, <16 x float>* %vec2p
   12691   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12692   ret <16 x float> %res
   12693 }
   12694 define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
         ; Merge-masked memory form: %vec2 is loaded from %vec2p; lanes whose %mask
         ; element equals zero take the interleave of %vec1/%vec2, others keep %vec3.
         ; Checks expect vptestnmd -> %k1, a {%k1}-masked vunpcklps with a `mem`
         ; operand into zmm1, then vmovaps zmm1 -> zmm0.
         ; NOTE(review): IR and expectations match the _mem_mask0 variant; only the
         ; function name differs.
   12695 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
   12696 ; GENERIC:       # %bb.0:
   12697 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   12698 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12699 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   12700 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12701 ;
   12702 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
   12703 ; SKX:       # %bb.0:
   12704 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   12705 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12706 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   12707 ; SKX-NEXT:    retq # sched: [7:1.00]
   12708   %vec2 = load <16 x float>, <16 x float>* %vec2p
   12709   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12710   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12711   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   12712   ret <16 x float> %res
   12713 }
   12714 
   12715 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
         ; Zero-masked memory form: %vec2 is loaded from %vec2p; lanes whose %mask
         ; element equals zero take the interleave of %vec1/%vec2, others are zero.
         ; Checks expect vptestnmd -> %k1 and vunpcklps with a `mem` operand and
         ; {%k1} {z} writing zmm0 directly.
         ; NOTE(review): IR and expectations match the _mem_mask0 variant; only the
         ; function name differs.
   12716 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
   12717 ; GENERIC:       # %bb.0:
   12718 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   12719 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12720 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12721 ;
   12722 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
   12723 ; SKX:       # %bb.0:
   12724 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   12725 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
   12726 ; SKX-NEXT:    retq # sched: [7:1.00]
   12727   %vec2 = load <16 x float>, <16 x float>* %vec2p
   12728   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   12729   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   12730   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   12731   ret <16 x float> %res
   12732 }
   12733 
   12734 define <2 x double> @test_2xdouble_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2) {
         ; Unmasked register form for <2 x double>: indices <0, 2> take element 0 of
         ; %vec1 followed by element 0 of %vec2. Both prefixes expect this to be
         ; emitted as vmovlhps on xmm registers (not vunpcklpd).
   12735 ; GENERIC-LABEL: test_2xdouble_unpack_low_mask0:
   12736 ; GENERIC:       # %bb.0:
   12737 ; GENERIC-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
   12738 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12739 ;
   12740 ; SKX-LABEL: test_2xdouble_unpack_low_mask0:
   12741 ; SKX:       # %bb.0:
   12742 ; SKX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
   12743 ; SKX-NEXT:    retq # sched: [7:1.00]
   12744   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   12745   ret <2 x double> %res
   12746 }
   12747 define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
         ; Merge-masked register form: lanes whose %mask element equals zero take
         ; <%vec1[0], %vec2[0]>, the remaining lanes keep %vec3 (icmp eq + select).
         ; Checks expect vptestnmq -> %k1, vunpcklpd into xmm2 under {%k1}, then a
         ; vmovapd copy of xmm2 to xmm0.
   12748 ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask0:
   12749 ; GENERIC:       # %bb.0:
   12750 ; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
   12751 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
   12752 ; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
   12753 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12754 ;
   12755 ; SKX-LABEL: test_2xdouble_masked_unpack_low_mask0:
   12756 ; SKX:       # %bb.0:
   12757 ; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
   12758 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
   12759 ; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
   12760 ; SKX-NEXT:    retq # sched: [7:1.00]
   12761   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   12762   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   12763   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
   12764   ret <2 x double> %res
   12765 }
   12766 
   12767 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
         ; Zero-masked register form: lanes whose %mask element equals zero take
         ; <%vec1[0], %vec2[0]>, the remaining lanes are zeroinitializer.
         ; Checks expect vptestnmq -> %k1 and vunpcklpd with {%k1} {z} writing xmm0.
   12768 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
   12769 ; GENERIC:       # %bb.0:
   12770 ; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
   12771 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
   12772 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12773 ;
   12774 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
   12775 ; SKX:       # %bb.0:
   12776 ; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
   12777 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
   12778 ; SKX-NEXT:    retq # sched: [7:1.00]
   12779   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   12780   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   12781   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   12782   ret <2 x double> %res
   12783 }
   12784 define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
         ; Merge-masked register form: lanes whose %mask element equals zero take
         ; <%vec1[0], %vec2[0]>, the remaining lanes keep %vec3 (icmp eq + select).
         ; Checks expect vptestnmq -> %k1, vunpcklpd into xmm2 under {%k1}, then a
         ; vmovapd copy of xmm2 to xmm0.
         ; NOTE(review): IR and expectations match the _mask0 variant; only the
         ; function name differs.
   12785 ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask1:
   12786 ; GENERIC:       # %bb.0:
   12787 ; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
   12788 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
   12789 ; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
   12790 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12791 ;
   12792 ; SKX-LABEL: test_2xdouble_masked_unpack_low_mask1:
   12793 ; SKX:       # %bb.0:
   12794 ; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
   12795 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
   12796 ; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
   12797 ; SKX-NEXT:    retq # sched: [7:1.00]
   12798   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   12799   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   12800   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
   12801   ret <2 x double> %res
   12802 }
   12803 
   12804 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
         ; Zero-masked register form: lanes whose %mask element equals zero take
         ; <%vec1[0], %vec2[0]>, the remaining lanes are zeroinitializer.
         ; Checks expect vptestnmq -> %k1 and vunpcklpd with {%k1} {z} writing xmm0.
         ; NOTE(review): IR and expectations match the _mask0 variant; only the
         ; function name differs.
   12805 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
   12806 ; GENERIC:       # %bb.0:
   12807 ; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
   12808 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
   12809 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12810 ;
   12811 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
   12812 ; SKX:       # %bb.0:
   12813 ; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
   12814 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
   12815 ; SKX-NEXT:    retq # sched: [7:1.00]
   12816   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   12817   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   12818   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   12819   ret <2 x double> %res
   12820 }
   12821 define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
         ; Unmasked memory form: %vec2 is loaded from %vec2p and the result is
         ; <%vec1[0], %vec2[0]>. Checks expect a single xmm vunpcklpd whose second
         ; source is the `mem` operand.
   12822 ; GENERIC-LABEL: test_2xdouble_unpack_low_mem_mask0:
   12823 ; GENERIC:       # %bb.0:
   12824 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
   12825 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12826 ;
   12827 ; SKX-LABEL: test_2xdouble_unpack_low_mem_mask0:
   12828 ; SKX:       # %bb.0:
   12829 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
   12830 ; SKX-NEXT:    retq # sched: [7:1.00]
   12831   %vec2 = load <2 x double>, <2 x double>* %vec2p
   12832   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   12833   ret <2 x double> %res
   12834 }
   12835 define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
         ; Merge-masked memory form: %vec2 is loaded from %vec2p; lanes whose %mask
         ; element equals zero take <%vec1[0], %vec2[0]>, others keep %vec3.
         ; Checks expect vptestnmq -> %k1, a {%k1}-masked vunpcklpd with a `mem`
         ; operand into xmm1, then vmovapd xmm1 -> xmm0.
   12836 ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
   12837 ; GENERIC:       # %bb.0:
   12838 ; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
   12839 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
   12840 ; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
   12841 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12842 ;
   12843 ; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
   12844 ; SKX:       # %bb.0:
   12845 ; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
   12846 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
   12847 ; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
   12848 ; SKX-NEXT:    retq # sched: [7:1.00]
   12849   %vec2 = load <2 x double>, <2 x double>* %vec2p
   12850   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   12851   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   12852   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
   12853   ret <2 x double> %res
   12854 }
   12855 
   12856 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
         ; Zero-masked memory form: %vec2 is loaded from %vec2p; lanes whose %mask
         ; element equals zero take <%vec1[0], %vec2[0]>, others are zero.
         ; Checks expect vptestnmq -> %k1 and vunpcklpd with a `mem` operand and
         ; {%k1} {z} writing xmm0 directly.
   12857 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
   12858 ; GENERIC:       # %bb.0:
   12859 ; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
   12860 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
   12861 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12862 ;
   12863 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
   12864 ; SKX:       # %bb.0:
   12865 ; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
   12866 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
   12867 ; SKX-NEXT:    retq # sched: [7:1.00]
   12868   %vec2 = load <2 x double>, <2 x double>* %vec2p
   12869   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   12870   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   12871   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   12872   ret <2 x double> %res
   12873 }
   12874 
   12875 define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
         ; Merge-masked memory form: %vec2 is loaded from %vec2p; lanes whose %mask
         ; element equals zero take <%vec1[0], %vec2[0]>, others keep %vec3.
         ; Checks expect vptestnmq -> %k1, a {%k1}-masked vunpcklpd with a `mem`
         ; operand into xmm1, then vmovapd xmm1 -> xmm0.
         ; NOTE(review): IR and expectations match the _mem_mask0 variant; only the
         ; function name differs.
   12876 ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
   12877 ; GENERIC:       # %bb.0:
   12878 ; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
   12879 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
   12880 ; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
   12881 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12882 ;
   12883 ; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
   12884 ; SKX:       # %bb.0:
   12885 ; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
   12886 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
   12887 ; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
   12888 ; SKX-NEXT:    retq # sched: [7:1.00]
   12889   %vec2 = load <2 x double>, <2 x double>* %vec2p
   12890   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   12891   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   12892   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
   12893   ret <2 x double> %res
   12894 }
   12895 
   12896 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
         ; Zero-masked memory form: %vec2 is loaded from %vec2p; lanes whose %mask
         ; element equals zero take <%vec1[0], %vec2[0]>, others are zero.
         ; Checks expect vptestnmq -> %k1 and vunpcklpd with a `mem` operand and
         ; {%k1} {z} writing xmm0 directly.
         ; NOTE(review): IR and expectations match the _mem_mask0 variant; only the
         ; function name differs.
   12897 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
   12898 ; GENERIC:       # %bb.0:
   12899 ; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
   12900 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
   12901 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12902 ;
   12903 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
   12904 ; SKX:       # %bb.0:
   12905 ; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
   12906 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
   12907 ; SKX-NEXT:    retq # sched: [7:1.00]
   12908   %vec2 = load <2 x double>, <2 x double>* %vec2p
   12909   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   12910   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   12911   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   12912   ret <2 x double> %res
   12913 }
   12914 
   12915 define <4 x double> @test_4xdouble_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2) {
         ; Unmasked register form for <4 x double>: indices <0, 4, 2, 6> take elements
         ; 0 and 2 of %vec1, each followed by the same position of %vec2.
         ; Both prefixes expect a single ymm vunpcklpd followed by retq.
   12916 ; GENERIC-LABEL: test_4xdouble_unpack_low_mask0:
   12917 ; GENERIC:       # %bb.0:
   12918 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   12919 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12920 ;
   12921 ; SKX-LABEL: test_4xdouble_unpack_low_mask0:
   12922 ; SKX:       # %bb.0:
   12923 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   12924 ; SKX-NEXT:    retq # sched: [7:1.00]
   12925   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   12926   ret <4 x double> %res
   12927 }
   12928 define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
         ; Merge-masked register form: lanes whose %mask element equals zero take the
         ; <0, 4, 2, 6> interleave of %vec1/%vec2, the remaining lanes keep %vec3.
         ; Checks expect vptestnmq -> %k1, vunpcklpd into ymm2 under {%k1}, then a
         ; vmovapd copy of ymm2 to ymm0.
   12929 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask0:
   12930 ; GENERIC:       # %bb.0:
   12931 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   12932 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   12933 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
   12934 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12935 ;
   12936 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mask0:
   12937 ; SKX:       # %bb.0:
   12938 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   12939 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   12940 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
   12941 ; SKX-NEXT:    retq # sched: [7:1.00]
   12942   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   12943   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   12944   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   12945   ret <4 x double> %res
   12946 }
   12947 
   12948 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
         ; Zero-masked register form: lanes whose %mask element equals zero take the
         ; <0, 4, 2, 6> interleave of %vec1/%vec2, the remaining lanes are zero.
         ; Checks expect vptestnmq -> %k1 and vunpcklpd with {%k1} {z} writing ymm0.
   12949 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
   12950 ; GENERIC:       # %bb.0:
   12951 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   12952 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   12953 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12954 ;
   12955 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
   12956 ; SKX:       # %bb.0:
   12957 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   12958 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   12959 ; SKX-NEXT:    retq # sched: [7:1.00]
   12960   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   12961   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   12962   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   12963   ret <4 x double> %res
   12964 }
   12965 define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
         ; Merge-masked register form: lanes whose %mask element equals zero take the
         ; <0, 4, 2, 6> interleave of %vec1/%vec2, the remaining lanes keep %vec3.
         ; Checks expect vptestnmq -> %k1, vunpcklpd into ymm2 under {%k1}, then a
         ; vmovapd copy of ymm2 to ymm0.
         ; NOTE(review): IR and expectations match the _mask0 variant; only the
         ; function name differs.
   12966 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask1:
   12967 ; GENERIC:       # %bb.0:
   12968 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   12969 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   12970 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
   12971 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12972 ;
   12973 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mask1:
   12974 ; SKX:       # %bb.0:
   12975 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   12976 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   12977 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
   12978 ; SKX-NEXT:    retq # sched: [7:1.00]
   12979   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   12980   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   12981   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   12982   ret <4 x double> %res
   12983 }
   12984 
   ; Zero-masked unpack-low of two <4 x double> registers: shuffle indices
   ; 0,4,2,6, with lanes whose %mask element is non-zero replaced by 0.0.
   ; The checks expect vptestnmq into %k1 followed by vunpcklpd {%k1} {z}.
   12985 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
   12986 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
   12987 ; GENERIC:       # %bb.0:
   12988 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   12989 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   12990 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   12991 ;
   12992 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
   12993 ; SKX:       # %bb.0:
   12994 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   12995 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   12996 ; SKX-NEXT:    retq # sched: [7:1.00]
   12997   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   12998   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   12999   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   13000   ret <4 x double> %res
   13001 }
   ; Merge-masked unpack-low of two <4 x double> registers: shuffle indices
   ; 0,4,2,6; lanes where %mask is zero take the shuffle, others keep %vec3.
   ; The checks expect vptestnmq into %k1, vunpcklpd under {%k1} into ymm2,
   ; then a vmovapd copy into ymm0.
   13002 define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
   13003 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask2:
   13004 ; GENERIC:       # %bb.0:
   13005 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   13006 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   13007 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
   13008 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13009 ;
   13010 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mask2:
   13011 ; SKX:       # %bb.0:
   13012 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   13013 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   13014 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
   13015 ; SKX-NEXT:    retq # sched: [7:1.00]
   13016   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13017   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   13018   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   13019   ret <4 x double> %res
   13020 }
   13021 
   ; Zero-masked unpack-low of two <4 x double> registers: shuffle indices
   ; 0,4,2,6, with lanes whose %mask element is non-zero replaced by 0.0.
   ; The checks expect vptestnmq into %k1 followed by vunpcklpd {%k1} {z}.
   13022 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
   13023 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
   13024 ; GENERIC:       # %bb.0:
   13025 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   13026 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   13027 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13028 ;
   13029 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
   13030 ; SKX:       # %bb.0:
   13031 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   13032 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   13033 ; SKX-NEXT:    retq # sched: [7:1.00]
   13034   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13035   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   13036   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   13037   ret <4 x double> %res
   13038 }
   ; Unmasked unpack-low of two <4 x double> registers: the shuffle picks
   ; elements 0,4,2,6 (vec1[0], vec2[0], vec1[2], vec2[2]). The checks expect
   ; a single register-register vunpcklpd for both check prefixes.
   13039 define <4 x double> @test_4xdouble_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2) {
   13040 ; GENERIC-LABEL: test_4xdouble_unpack_low_mask3:
   13041 ; GENERIC:       # %bb.0:
   13042 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   13043 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13044 ;
   13045 ; SKX-LABEL: test_4xdouble_unpack_low_mask3:
   13046 ; SKX:       # %bb.0:
   13047 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   13048 ; SKX-NEXT:    retq # sched: [7:1.00]
   13049   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13050   ret <4 x double> %res
   13051 }
   ; Merge-masked unpack-low of two <4 x double> registers: shuffle indices
   ; 0,4,2,6; lanes where %mask is zero take the shuffle, others keep %vec3.
   ; The checks expect vptestnmq into %k1, vunpcklpd under {%k1} into ymm2,
   ; then a vmovapd copy into ymm0.
   13052 define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
   13053 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask3:
   13054 ; GENERIC:       # %bb.0:
   13055 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   13056 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   13057 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
   13058 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13059 ;
   13060 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mask3:
   13061 ; SKX:       # %bb.0:
   13062 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   13063 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   13064 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
   13065 ; SKX-NEXT:    retq # sched: [7:1.00]
   13066   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13067   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   13068   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   13069   ret <4 x double> %res
   13070 }
   13071 
   ; Zero-masked unpack-low of two <4 x double> registers: shuffle indices
   ; 0,4,2,6, with lanes whose %mask element is non-zero replaced by 0.0.
   ; The checks expect vptestnmq into %k1 followed by vunpcklpd {%k1} {z}.
   13072 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
   13073 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
   13074 ; GENERIC:       # %bb.0:
   13075 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   13076 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   13077 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13078 ;
   13079 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
   13080 ; SKX:       # %bb.0:
   13081 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   13082 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   13083 ; SKX-NEXT:    retq # sched: [7:1.00]
   13084   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13085   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   13086   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   13087   ret <4 x double> %res
   13088 }
   ; Unmasked unpack-low where the second <4 x double> operand is loaded from
   ; %vec2p; shuffle indices are 0,4,2,6. The checks expect the load to show
   ; up as the `mem` operand of a single vunpcklpd.
   13089 define <4 x double> @test_4xdouble_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
   13090 ; GENERIC-LABEL: test_4xdouble_unpack_low_mem_mask0:
   13091 ; GENERIC:       # %bb.0:
   13092 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13093 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13094 ;
   13095 ; SKX-LABEL: test_4xdouble_unpack_low_mem_mask0:
   13096 ; SKX:       # %bb.0:
   13097 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13098 ; SKX-NEXT:    retq # sched: [7:1.00]
   13099   %vec2 = load <4 x double>, <4 x double>* %vec2p
   13100   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13101   ret <4 x double> %res
   13102 }
   ; Merge-masked unpack-low with the second <4 x double> operand loaded from
   ; %vec2p: shuffle indices 0,4,2,6; lanes where %mask is zero take the
   ; shuffle, others keep %vec3. The checks expect vptestnmq into %k1, a
   ; vunpcklpd under {%k1} with a `mem` operand into ymm1, then a vmovapd copy
   ; into ymm0.
   13103 define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
   13104 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
   13105 ; GENERIC:       # %bb.0:
   13106 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   13107 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13108 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   13109 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13110 ;
   13111 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
   13112 ; SKX:       # %bb.0:
   13113 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   13114 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13115 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   13116 ; SKX-NEXT:    retq # sched: [7:1.00]
   13117   %vec2 = load <4 x double>, <4 x double>* %vec2p
   13118   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13119   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   13120   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   13121   ret <4 x double> %res
   13122 }
   13123 
   ; Zero-masked unpack-low with the second <4 x double> operand loaded from
   ; %vec2p: shuffle indices 0,4,2,6, lanes whose %mask element is non-zero
   ; become 0.0. The checks expect vptestnmq into %k1 and a vunpcklpd
   ; {%k1} {z} with a `mem` operand.
   13124 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
   13125 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
   13126 ; GENERIC:       # %bb.0:
   13127 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   13128 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13129 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13130 ;
   13131 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
   13132 ; SKX:       # %bb.0:
   13133 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   13134 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13135 ; SKX-NEXT:    retq # sched: [7:1.00]
   13136   %vec2 = load <4 x double>, <4 x double>* %vec2p
   13137   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13138   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   13139   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   13140   ret <4 x double> %res
   13141 }
   13142 
   ; Merge-masked unpack-low with the second <4 x double> operand loaded from
   ; %vec2p: shuffle indices 0,4,2,6; lanes where %mask is zero take the
   ; shuffle, others keep %vec3. The checks expect vptestnmq into %k1, a
   ; vunpcklpd under {%k1} with a `mem` operand into ymm1, then a vmovapd copy
   ; into ymm0.
   13143 define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
   13144 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
   13145 ; GENERIC:       # %bb.0:
   13146 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   13147 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13148 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   13149 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13150 ;
   13151 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
   13152 ; SKX:       # %bb.0:
   13153 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   13154 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13155 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   13156 ; SKX-NEXT:    retq # sched: [7:1.00]
   13157   %vec2 = load <4 x double>, <4 x double>* %vec2p
   13158   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13159   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   13160   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   13161   ret <4 x double> %res
   13162 }
   13163 
   ; Zero-masked unpack-low with the second <4 x double> operand loaded from
   ; %vec2p: shuffle indices 0,4,2,6, lanes whose %mask element is non-zero
   ; become 0.0. The checks expect vptestnmq into %k1 and a vunpcklpd
   ; {%k1} {z} with a `mem` operand.
   13164 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
   13165 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
   13166 ; GENERIC:       # %bb.0:
   13167 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   13168 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13169 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13170 ;
   13171 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
   13172 ; SKX:       # %bb.0:
   13173 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   13174 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13175 ; SKX-NEXT:    retq # sched: [7:1.00]
   13176   %vec2 = load <4 x double>, <4 x double>* %vec2p
   13177   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13178   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   13179   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   13180   ret <4 x double> %res
   13181 }
   13182 
   ; Merge-masked unpack-low with the second <4 x double> operand loaded from
   ; %vec2p: shuffle indices 0,4,2,6; lanes where %mask is zero take the
   ; shuffle, others keep %vec3. The checks expect vptestnmq into %k1, a
   ; vunpcklpd under {%k1} with a `mem` operand into ymm1, then a vmovapd copy
   ; into ymm0.
   13183 define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
   13184 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
   13185 ; GENERIC:       # %bb.0:
   13186 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   13187 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13188 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   13189 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13190 ;
   13191 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
   13192 ; SKX:       # %bb.0:
   13193 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   13194 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13195 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   13196 ; SKX-NEXT:    retq # sched: [7:1.00]
   13197   %vec2 = load <4 x double>, <4 x double>* %vec2p
   13198   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13199   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   13200   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   13201   ret <4 x double> %res
   13202 }
   13203 
   ; Zero-masked unpack-low with the second <4 x double> operand loaded from
   ; %vec2p: shuffle indices 0,4,2,6, lanes whose %mask element is non-zero
   ; become 0.0. The checks expect vptestnmq into %k1 and a vunpcklpd
   ; {%k1} {z} with a `mem` operand.
   13204 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
   13205 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
   13206 ; GENERIC:       # %bb.0:
   13207 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   13208 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13209 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13210 ;
   13211 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
   13212 ; SKX:       # %bb.0:
   13213 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   13214 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13215 ; SKX-NEXT:    retq # sched: [7:1.00]
   13216   %vec2 = load <4 x double>, <4 x double>* %vec2p
   13217   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13218   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   13219   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   13220   ret <4 x double> %res
   13221 }
   13222 
   ; Unmasked unpack-low where the second <4 x double> operand is loaded from
   ; %vec2p; shuffle indices are 0,4,2,6. The checks expect the load to show
   ; up as the `mem` operand of a single vunpcklpd.
   13223 define <4 x double> @test_4xdouble_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
   13224 ; GENERIC-LABEL: test_4xdouble_unpack_low_mem_mask3:
   13225 ; GENERIC:       # %bb.0:
   13226 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13227 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13228 ;
   13229 ; SKX-LABEL: test_4xdouble_unpack_low_mem_mask3:
   13230 ; SKX:       # %bb.0:
   13231 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13232 ; SKX-NEXT:    retq # sched: [7:1.00]
   13233   %vec2 = load <4 x double>, <4 x double>* %vec2p
   13234   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13235   ret <4 x double> %res
   13236 }
   ; Merge-masked unpack-low with the second <4 x double> operand loaded from
   ; %vec2p: shuffle indices 0,4,2,6; lanes where %mask is zero take the
   ; shuffle, others keep %vec3. The checks expect vptestnmq into %k1, a
   ; vunpcklpd under {%k1} with a `mem` operand into ymm1, then a vmovapd copy
   ; into ymm0.
   13237 define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
   13238 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
   13239 ; GENERIC:       # %bb.0:
   13240 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   13241 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13242 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   13243 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13244 ;
   13245 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
   13246 ; SKX:       # %bb.0:
   13247 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   13248 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13249 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   13250 ; SKX-NEXT:    retq # sched: [7:1.00]
   13251   %vec2 = load <4 x double>, <4 x double>* %vec2p
   13252   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13253   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   13254   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   13255   ret <4 x double> %res
   13256 }
   13257 
   ; Zero-masked unpack-low with the second <4 x double> operand loaded from
   ; %vec2p: shuffle indices 0,4,2,6, lanes whose %mask element is non-zero
   ; become 0.0. The checks expect vptestnmq into %k1 and a vunpcklpd
   ; {%k1} {z} with a `mem` operand.
   13258 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
   13259 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
   13260 ; GENERIC:       # %bb.0:
   13261 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   13262 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13263 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13264 ;
   13265 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
   13266 ; SKX:       # %bb.0:
   13267 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   13268 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   13269 ; SKX-NEXT:    retq # sched: [7:1.00]
   13270   %vec2 = load <4 x double>, <4 x double>* %vec2p
   13271   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   13272   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   13273   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   13274   ret <4 x double> %res
   13275 }
   13276 
   ; Unmasked unpack-low of two <8 x double> registers: the shuffle picks
   ; elements 0,8,2,10,4,12,6,14 (the even elements of %vec1 and %vec2,
   ; interleaved). The checks expect a single register-register vunpcklpd on
   ; zmm registers.
   13277 define <8 x double> @test_8xdouble_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2) {
   13278 ; GENERIC-LABEL: test_8xdouble_unpack_low_mask0:
   13279 ; GENERIC:       # %bb.0:
   13280 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13281 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13282 ;
   13283 ; SKX-LABEL: test_8xdouble_unpack_low_mask0:
   13284 ; SKX:       # %bb.0:
   13285 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13286 ; SKX-NEXT:    retq # sched: [7:1.00]
   13287   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13288   ret <8 x double> %res
   13289 }
   ; Merge-masked unpack-low of two <8 x double> registers: shuffle indices
   ; 0,8,2,10,4,12,6,14; lanes where %mask is zero take the shuffle, others
   ; keep %vec3. The checks expect vptestnmq into %k1, vunpcklpd under {%k1}
   ; into zmm2, then a vmovapd copy into zmm0.
   13290 define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
   13291 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask0:
   13292 ; GENERIC:       # %bb.0:
   13293 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   13294 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13295 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
   13296 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13297 ;
   13298 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask0:
   13299 ; SKX:       # %bb.0:
   13300 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   13301 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13302 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
   13303 ; SKX-NEXT:    retq # sched: [7:1.00]
   13304   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13305   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13306   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   13307   ret <8 x double> %res
   13308 }
   13309 
   ; Zero-masked unpack-low of two <8 x double> registers: shuffle indices
   ; 0,8,2,10,4,12,6,14, with lanes whose %mask element is non-zero replaced
   ; by 0.0. The checks expect vptestnmq into %k1 followed by vunpcklpd
   ; {%k1} {z} on zmm registers.
   13310 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
   13311 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
   13312 ; GENERIC:       # %bb.0:
   13313 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   13314 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13315 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13316 ;
   13317 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
   13318 ; SKX:       # %bb.0:
   13319 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   13320 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13321 ; SKX-NEXT:    retq # sched: [7:1.00]
   13322   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13323   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13324   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   13325   ret <8 x double> %res
   13326 }
   ; Merge-masked unpack-low of two <8 x double> registers: shuffle indices
   ; 0,8,2,10,4,12,6,14; lanes where %mask is zero take the shuffle, others
   ; keep %vec3. The checks expect vptestnmq into %k1, vunpcklpd under {%k1}
   ; into zmm2, then a vmovapd copy into zmm0.
   13327 define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
   13328 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask1:
   13329 ; GENERIC:       # %bb.0:
   13330 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   13331 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13332 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
   13333 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13334 ;
   13335 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask1:
   13336 ; SKX:       # %bb.0:
   13337 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   13338 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13339 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
   13340 ; SKX-NEXT:    retq # sched: [7:1.00]
   13341   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13342   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13343   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   13344   ret <8 x double> %res
   13345 }
   13346 
   ; Zero-masked unpack-low of two <8 x double> registers: shuffle indices
   ; 0,8,2,10,4,12,6,14, with lanes whose %mask element is non-zero replaced
   ; by 0.0. The checks expect vptestnmq into %k1 followed by vunpcklpd
   ; {%k1} {z} on zmm registers.
   13347 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
   13348 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
   13349 ; GENERIC:       # %bb.0:
   13350 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   13351 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13352 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13353 ;
   13354 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
   13355 ; SKX:       # %bb.0:
   13356 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   13357 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13358 ; SKX-NEXT:    retq # sched: [7:1.00]
   13359   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13360   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13361   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   13362   ret <8 x double> %res
   13363 }
   ; Merge-masked unpack-low of two <8 x double> registers: shuffle indices
   ; 0,8,2,10,4,12,6,14; lanes where %mask is zero take the shuffle, others
   ; keep %vec3. The checks expect vptestnmq into %k1, vunpcklpd under {%k1}
   ; into zmm2, then a vmovapd copy into zmm0.
   13364 define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
   13365 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask2:
   13366 ; GENERIC:       # %bb.0:
   13367 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   13368 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13369 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
   13370 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13371 ;
   13372 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask2:
   13373 ; SKX:       # %bb.0:
   13374 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   13375 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13376 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
   13377 ; SKX-NEXT:    retq # sched: [7:1.00]
   13378   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13379   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13380   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   13381   ret <8 x double> %res
   13382 }
   13383 
   ; Zero-masked unpack-low of two <8 x double> registers: shuffle indices
   ; 0,8,2,10,4,12,6,14, with lanes whose %mask element is non-zero replaced
   ; by 0.0. The checks expect vptestnmq into %k1 followed by vunpcklpd
   ; {%k1} {z} on zmm registers.
   13384 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
   13385 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
   13386 ; GENERIC:       # %bb.0:
   13387 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   13388 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13389 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13390 ;
   13391 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
   13392 ; SKX:       # %bb.0:
   13393 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   13394 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13395 ; SKX-NEXT:    retq # sched: [7:1.00]
   13396   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13397   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13398   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   13399   ret <8 x double> %res
   13400 }
   ; Unmasked unpack-low of two <8 x double> registers: the shuffle picks
   ; elements 0,8,2,10,4,12,6,14 (the even elements of %vec1 and %vec2,
   ; interleaved). The checks expect a single register-register vunpcklpd on
   ; zmm registers.
   13401 define <8 x double> @test_8xdouble_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2) {
   13402 ; GENERIC-LABEL: test_8xdouble_unpack_low_mask3:
   13403 ; GENERIC:       # %bb.0:
   13404 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13405 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13406 ;
   13407 ; SKX-LABEL: test_8xdouble_unpack_low_mask3:
   13408 ; SKX:       # %bb.0:
   13409 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13410 ; SKX-NEXT:    retq # sched: [7:1.00]
   13411   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13412   ret <8 x double> %res
   13413 }
   ; Merge-masked unpack-low of two <8 x double> registers: shuffle indices
   ; 0,8,2,10,4,12,6,14; lanes where %mask is zero take the shuffle, others
   ; keep %vec3. The checks expect vptestnmq into %k1, vunpcklpd under {%k1}
   ; into zmm2, then a vmovapd copy into zmm0.
   13414 define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
   13415 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask3:
   13416 ; GENERIC:       # %bb.0:
   13417 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   13418 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13419 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
   13420 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13421 ;
   13422 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask3:
   13423 ; SKX:       # %bb.0:
   13424 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   13425 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13426 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
   13427 ; SKX-NEXT:    retq # sched: [7:1.00]
   13428   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13429   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13430   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   13431   ret <8 x double> %res
   13432 }
   13433 
   ; Zero-masked unpack-low of two <8 x double> registers: shuffle indices
   ; 0,8,2,10,4,12,6,14, with lanes whose %mask element is non-zero replaced
   ; by 0.0. The checks expect vptestnmq into %k1 followed by vunpcklpd
   ; {%k1} {z} on zmm registers.
   13434 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
   13435 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
   13436 ; GENERIC:       # %bb.0:
   13437 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   13438 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13439 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13440 ;
   13441 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
   13442 ; SKX:       # %bb.0:
   13443 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   13444 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
   13445 ; SKX-NEXT:    retq # sched: [7:1.00]
   13446   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13447   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13448   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   13449   ret <8 x double> %res
   13450 }
   ; Unmasked unpack-low where the second <8 x double> operand is loaded from
   ; %vec2p; shuffle indices are 0,8,2,10,4,12,6,14. The checks expect the
   ; load to show up as the `mem` operand of a single zmm vunpcklpd.
   13451 define <8 x double> @test_8xdouble_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
   13452 ; GENERIC-LABEL: test_8xdouble_unpack_low_mem_mask0:
   13453 ; GENERIC:       # %bb.0:
   13454 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13455 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13456 ;
   13457 ; SKX-LABEL: test_8xdouble_unpack_low_mem_mask0:
   13458 ; SKX:       # %bb.0:
   13459 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13460 ; SKX-NEXT:    retq # sched: [7:1.00]
   13461   %vec2 = load <8 x double>, <8 x double>* %vec2p
   13462   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13463   ret <8 x double> %res
   13464 }
   ; Merge-masked unpack-low with the second <8 x double> operand loaded from
   ; %vec2p: shuffle indices 0,8,2,10,4,12,6,14; lanes where %mask is zero
   ; take the shuffle, others keep %vec3. The checks expect vptestnmq into
   ; %k1, a vunpcklpd under {%k1} with a `mem` operand into zmm1, then a
   ; vmovapd copy into zmm0.
   13465 define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
   13466 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
   13467 ; GENERIC:       # %bb.0:
   13468 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   13469 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13470 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   13471 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13472 ;
   13473 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
   13474 ; SKX:       # %bb.0:
   13475 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   13476 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13477 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   13478 ; SKX-NEXT:    retq # sched: [7:1.00]
   13479   %vec2 = load <8 x double>, <8 x double>* %vec2p
   13480   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13481   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13482   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   13483   ret <8 x double> %res
   13484 }
   13485 
   13486 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
   13487 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
   13488 ; GENERIC:       # %bb.0:
   13489 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   13490 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13491 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13492 ;
   13493 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
   13494 ; SKX:       # %bb.0:
   13495 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   13496 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13497 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Zero-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved even-indexed elements of
           ; %vec1/%vec2, all other lanes become zero (`{%k1} {z}` in the checks).
   13498   %vec2 = load <8 x double>, <8 x double>* %vec2p
   13499   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13500   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13501   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   13502   ret <8 x double> %res
   13503 }
   13504 
   13505 define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
   13506 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
   13507 ; GENERIC:       # %bb.0:
   13508 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   13509 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13510 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   13511 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13512 ;
   13513 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
   13514 ; SKX:       # %bb.0:
   13515 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   13516 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13517 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   13518 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Merge-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved even-indexed elements of
           ; %vec1/%vec2, all other lanes keep %vec3 (the `{%k1}` operand in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13519   %vec2 = load <8 x double>, <8 x double>* %vec2p
   13520   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13521   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13522   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   13523   ret <8 x double> %res
   13524 }
   13525 
   13526 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
   13527 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
   13528 ; GENERIC:       # %bb.0:
   13529 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   13530 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13531 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13532 ;
   13533 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
   13534 ; SKX:       # %bb.0:
   13535 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   13536 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13537 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Zero-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved even-indexed elements of
           ; %vec1/%vec2, all other lanes become zero (`{%k1} {z}` in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13538   %vec2 = load <8 x double>, <8 x double>* %vec2p
   13539   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13540   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13541   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   13542   ret <8 x double> %res
   13543 }
   13544 
   13545 define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
   13546 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
   13547 ; GENERIC:       # %bb.0:
   13548 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   13549 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13550 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   13551 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13552 ;
   13553 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
   13554 ; SKX:       # %bb.0:
   13555 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   13556 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13557 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   13558 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Merge-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved even-indexed elements of
           ; %vec1/%vec2, all other lanes keep %vec3 (the `{%k1}` operand in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13559   %vec2 = load <8 x double>, <8 x double>* %vec2p
   13560   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13561   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13562   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   13563   ret <8 x double> %res
   13564 }
   13565 
   13566 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
   13567 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
   13568 ; GENERIC:       # %bb.0:
   13569 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   13570 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13571 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13572 ;
   13573 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
   13574 ; SKX:       # %bb.0:
   13575 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   13576 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13577 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Zero-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved even-indexed elements of
           ; %vec1/%vec2, all other lanes become zero (`{%k1} {z}` in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13578   %vec2 = load <8 x double>, <8 x double>* %vec2p
   13579   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13580   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13581   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   13582   ret <8 x double> %res
   13583 }
   13584 
   13585 define <8 x double> @test_8xdouble_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
   13586 ; GENERIC-LABEL: test_8xdouble_unpack_low_mem_mask3:
   13587 ; GENERIC:       # %bb.0:
   13588 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13589 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13590 ;
   13591 ; SKX-LABEL: test_8xdouble_unpack_low_mem_mask3:
   13592 ; SKX:       # %bb.0:
   13593 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13594 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Unmasked form with the second source loaded from memory. The shuffle
           ; interleaves the even-indexed elements of %vec1 and %vec2; the checks
           ; expect it to fold into one vunpcklpd with a memory operand.
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13595   %vec2 = load <8 x double>, <8 x double>* %vec2p
   13596   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13597   ret <8 x double> %res
   13598 }
   13599 define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
   13600 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
   13601 ; GENERIC:       # %bb.0:
   13602 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   13603 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13604 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   13605 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13606 ;
   13607 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
   13608 ; SKX:       # %bb.0:
   13609 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   13610 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13611 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   13612 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Merge-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved even-indexed elements of
           ; %vec1/%vec2, all other lanes keep %vec3 (the `{%k1}` operand in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13613   %vec2 = load <8 x double>, <8 x double>* %vec2p
   13614   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13615   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13616   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   13617   ret <8 x double> %res
   13618 }
   13619 
   13620 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
   13621 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
   13622 ; GENERIC:       # %bb.0:
   13623 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   13624 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13625 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13626 ;
   13627 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
   13628 ; SKX:       # %bb.0:
   13629 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   13630 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
   13631 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Zero-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved even-indexed elements of
           ; %vec1/%vec2, all other lanes become zero (`{%k1} {z}` in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13632   %vec2 = load <8 x double>, <8 x double>* %vec2p
   13633   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   13634   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   13635   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   13636   ret <8 x double> %res
   13637 }
   13638 
   13639 define <4 x float> @test_4xfloat_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2) {
   13640 ; GENERIC-LABEL: test_4xfloat_unpack_high_mask0:
   13641 ; GENERIC:       # %bb.0:
   13642 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13643 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13644 ;
   13645 ; SKX-LABEL: test_4xfloat_unpack_high_mask0:
   13646 ; SKX:       # %bb.0:
   13647 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13648 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Unmasked register form. The shuffle interleaves elements 2 and 3 of
           ; %vec1 and %vec2; the checks expect a single vunpckhps.
   13649   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13650   ret <4 x float> %res
   13651 }
   13652 define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
   13653 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask0:
   13654 ; GENERIC:       # %bb.0:
   13655 ; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
   13656 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13657 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
   13658 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13659 ;
   13660 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mask0:
   13661 ; SKX:       # %bb.0:
   13662 ; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
   13663 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13664 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
   13665 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Merge-masked register form: lanes whose %mask element is zero take the
           ; interleaved elements 2 and 3 of %vec1/%vec2, all other lanes keep %vec3
           ; (the `{%k1}` operand in the checks).
   13666   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13667   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13668   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   13669   ret <4 x float> %res
   13670 }
   13671 
   13672 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
   13673 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
   13674 ; GENERIC:       # %bb.0:
   13675 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   13676 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13677 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13678 ;
   13679 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
   13680 ; SKX:       # %bb.0:
   13681 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   13682 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13683 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Zero-masked register form: lanes whose %mask element is zero take the
           ; interleaved elements 2 and 3 of %vec1/%vec2, all other lanes become zero
           ; (`{%k1} {z}` in the checks).
   13684   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13685   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13686   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   13687   ret <4 x float> %res
   13688 }
   13689 define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
   13690 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask1:
   13691 ; GENERIC:       # %bb.0:
   13692 ; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
   13693 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13694 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
   13695 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13696 ;
   13697 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mask1:
   13698 ; SKX:       # %bb.0:
   13699 ; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
   13700 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13701 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
   13702 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Merge-masked register form: lanes whose %mask element is zero take the
           ; interleaved elements 2 and 3 of %vec1/%vec2, all other lanes keep %vec3
           ; (the `{%k1}` operand in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13703   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13704   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13705   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   13706   ret <4 x float> %res
   13707 }
   13708 
   13709 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
   13710 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
   13711 ; GENERIC:       # %bb.0:
   13712 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   13713 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13714 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13715 ;
   13716 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
   13717 ; SKX:       # %bb.0:
   13718 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   13719 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13720 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Zero-masked register form: lanes whose %mask element is zero take the
           ; interleaved elements 2 and 3 of %vec1/%vec2, all other lanes become zero
           ; (`{%k1} {z}` in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13721   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13722   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13723   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   13724   ret <4 x float> %res
   13725 }
   13726 define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
   13727 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask2:
   13728 ; GENERIC:       # %bb.0:
   13729 ; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
   13730 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13731 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
   13732 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13733 ;
   13734 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mask2:
   13735 ; SKX:       # %bb.0:
   13736 ; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
   13737 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13738 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
   13739 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Merge-masked register form: lanes whose %mask element is zero take the
           ; interleaved elements 2 and 3 of %vec1/%vec2, all other lanes keep %vec3
           ; (the `{%k1}` operand in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13740   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13741   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13742   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   13743   ret <4 x float> %res
   13744 }
   13745 
   13746 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
   13747 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
   13748 ; GENERIC:       # %bb.0:
   13749 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   13750 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13751 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13752 ;
   13753 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
   13754 ; SKX:       # %bb.0:
   13755 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   13756 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13757 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Zero-masked register form: lanes whose %mask element is zero take the
           ; interleaved elements 2 and 3 of %vec1/%vec2, all other lanes become zero
           ; (`{%k1} {z}` in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13758   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13759   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13760   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   13761   ret <4 x float> %res
   13762 }
   13763 define <4 x float> @test_4xfloat_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2) {
   13764 ; GENERIC-LABEL: test_4xfloat_unpack_high_mask3:
   13765 ; GENERIC:       # %bb.0:
   13766 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13767 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13768 ;
   13769 ; SKX-LABEL: test_4xfloat_unpack_high_mask3:
   13770 ; SKX:       # %bb.0:
   13771 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13772 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Unmasked register form. The shuffle interleaves elements 2 and 3 of
           ; %vec1 and %vec2; the checks expect a single vunpckhps.
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13773   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13774   ret <4 x float> %res
   13775 }
   13776 define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
   13777 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask3:
   13778 ; GENERIC:       # %bb.0:
   13779 ; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
   13780 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13781 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
   13782 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13783 ;
   13784 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mask3:
   13785 ; SKX:       # %bb.0:
   13786 ; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
   13787 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13788 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
   13789 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Merge-masked register form: lanes whose %mask element is zero take the
           ; interleaved elements 2 and 3 of %vec1/%vec2, all other lanes keep %vec3
           ; (the `{%k1}` operand in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13790   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13791   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13792   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   13793   ret <4 x float> %res
   13794 }
   13795 
   13796 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
   13797 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
   13798 ; GENERIC:       # %bb.0:
   13799 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   13800 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13801 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13802 ;
   13803 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
   13804 ; SKX:       # %bb.0:
   13805 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   13806 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
   13807 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Zero-masked register form: lanes whose %mask element is zero take the
           ; interleaved elements 2 and 3 of %vec1/%vec2, all other lanes become zero
           ; (`{%k1} {z}` in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13808   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13809   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13810   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   13811   ret <4 x float> %res
   13812 }
   13813 define <4 x float> @test_4xfloat_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
   13814 ; GENERIC-LABEL: test_4xfloat_unpack_high_mem_mask0:
   13815 ; GENERIC:       # %bb.0:
   13816 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13817 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13818 ;
   13819 ; SKX-LABEL: test_4xfloat_unpack_high_mem_mask0:
   13820 ; SKX:       # %bb.0:
   13821 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13822 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Unmasked form with the second source loaded from memory. The shuffle
           ; interleaves elements 2 and 3 of %vec1 and %vec2; the checks expect it
           ; to fold into one vunpckhps with a memory operand.
   13823   %vec2 = load <4 x float>, <4 x float>* %vec2p
   13824   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13825   ret <4 x float> %res
   13826 }
   13827 define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
   13828 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
   13829 ; GENERIC:       # %bb.0:
   13830 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   13831 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13832 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
   13833 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13834 ;
   13835 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
   13836 ; SKX:       # %bb.0:
   13837 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   13838 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13839 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
   13840 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Merge-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved elements 2 and 3 of
           ; %vec1/%vec2, all other lanes keep %vec3 (the `{%k1}` operand in the checks).
   13841   %vec2 = load <4 x float>, <4 x float>* %vec2p
   13842   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13843   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13844   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   13845   ret <4 x float> %res
   13846 }
   13847 
   13848 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
   13849 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
   13850 ; GENERIC:       # %bb.0:
   13851 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   13852 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13853 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13854 ;
   13855 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
   13856 ; SKX:       # %bb.0:
   13857 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   13858 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13859 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Zero-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved elements 2 and 3 of
           ; %vec1/%vec2, all other lanes become zero (`{%k1} {z}` in the checks).
   13860   %vec2 = load <4 x float>, <4 x float>* %vec2p
   13861   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13862   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13863   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   13864   ret <4 x float> %res
   13865 }
   13866 
   13867 define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
   13868 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
   13869 ; GENERIC:       # %bb.0:
   13870 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   13871 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13872 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
   13873 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13874 ;
   13875 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
   13876 ; SKX:       # %bb.0:
   13877 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   13878 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13879 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
   13880 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Merge-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved elements 2 and 3 of
           ; %vec1/%vec2, all other lanes keep %vec3 (the `{%k1}` operand in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13881   %vec2 = load <4 x float>, <4 x float>* %vec2p
   13882   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13883   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13884   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   13885   ret <4 x float> %res
   13886 }
   13887 
   13888 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
   13889 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
   13890 ; GENERIC:       # %bb.0:
   13891 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   13892 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13893 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13894 ;
   13895 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
   13896 ; SKX:       # %bb.0:
   13897 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   13898 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13899 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Zero-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved elements 2 and 3 of
           ; %vec1/%vec2, all other lanes become zero (`{%k1} {z}` in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13900   %vec2 = load <4 x float>, <4 x float>* %vec2p
   13901   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13902   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13903   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   13904   ret <4 x float> %res
   13905 }
   13906 
   13907 define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
   13908 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
   13909 ; GENERIC:       # %bb.0:
   13910 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   13911 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13912 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
   13913 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13914 ;
   13915 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
   13916 ; SKX:       # %bb.0:
   13917 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   13918 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13919 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
   13920 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Merge-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved elements 2 and 3 of
           ; %vec1/%vec2, all other lanes keep %vec3 (the `{%k1}` operand in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13921   %vec2 = load <4 x float>, <4 x float>* %vec2p
   13922   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13923   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13924   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   13925   ret <4 x float> %res
   13926 }
   13927 
   13928 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
   13929 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
   13930 ; GENERIC:       # %bb.0:
   13931 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   13932 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13933 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13934 ;
   13935 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
   13936 ; SKX:       # %bb.0:
   13937 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   13938 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13939 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Zero-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved elements 2 and 3 of
           ; %vec1/%vec2, all other lanes become zero (`{%k1} {z}` in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13940   %vec2 = load <4 x float>, <4 x float>* %vec2p
   13941   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13942   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13943   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   13944   ret <4 x float> %res
   13945 }
   13946 
   13947 define <4 x float> @test_4xfloat_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
   13948 ; GENERIC-LABEL: test_4xfloat_unpack_high_mem_mask3:
   13949 ; GENERIC:       # %bb.0:
   13950 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13951 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13952 ;
   13953 ; SKX-LABEL: test_4xfloat_unpack_high_mem_mask3:
   13954 ; SKX:       # %bb.0:
   13955 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13956 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Unmasked form with the second source loaded from memory. The shuffle
           ; interleaves elements 2 and 3 of %vec1 and %vec2; the checks expect it
           ; to fold into one vunpckhps with a memory operand.
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13957   %vec2 = load <4 x float>, <4 x float>* %vec2p
   13958   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13959   ret <4 x float> %res
   13960 }
   13961 define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
   13962 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
   13963 ; GENERIC:       # %bb.0:
   13964 ; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
   13965 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13966 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
   13967 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13968 ;
   13969 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
   13970 ; SKX:       # %bb.0:
   13971 ; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
   13972 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13973 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
   13974 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Merge-masked form with the second source loaded from memory: lanes whose
           ; %mask element is zero take the interleaved elements 2 and 3 of
           ; %vec1/%vec2, all other lanes keep %vec3 (the `{%k1}` operand in the checks).
           ; NOTE(review): this IR is identical to the _mask0 variant; only the name differs.
   13975   %vec2 = load <4 x float>, <4 x float>* %vec2p
   13976   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13977   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13978   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   13979   ret <4 x float> %res
   13980 }
   13981 
   ; Zero-masked high unpack with a memory operand: lanes where %mask equals zero take the
   ; <2,6,3,7> shuffle of %vec1 and the vector loaded from %vec2p; the other lanes are zero.
   ; The checks expect vptestnmd to set %k1 followed by a {%k1} {z} vunpckhps.
   13982 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
   13983 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
   13984 ; GENERIC:       # %bb.0:
   13985 ; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
   13986 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13987 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   13988 ;
   13989 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
   13990 ; SKX:       # %bb.0:
   13991 ; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
   13992 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
   13993 ; SKX-NEXT:    retq # sched: [7:1.00]
   13994   %vec2 = load <4 x float>, <4 x float>* %vec2p
   13995   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   13996   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   13997   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   13998   ret <4 x float> %res
   13999 }
   14000 
   ; Unmasked 8 x float high unpack of two register operands: shuffle indices
   ; <2,10,3,11,6,14,7,15> interleave elements 2, 3, 6 and 7 of %vec1 with the same elements of %vec2.
   ; Both prefixes expect a single ymm vunpckhps followed by retq.
   14001 define <8 x float> @test_8xfloat_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2) {
   14002 ; GENERIC-LABEL: test_8xfloat_unpack_high_mask0:
   14003 ; GENERIC:       # %bb.0:
   14004 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14005 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14006 ;
   14007 ; SKX-LABEL: test_8xfloat_unpack_high_mask0:
   14008 ; SKX:       # %bb.0:
   14009 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14010 ; SKX-NEXT:    retq # sched: [7:1.00]
   14011   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14012   ret <8 x float> %res
   14013 }
   ; Merge-masked 8 x float high unpack: lanes where %mask equals zero take the
   ; <2,10,3,11,6,14,7,15> shuffle of %vec1 and %vec2; the other lanes keep %vec3.
   ; The checks expect vptestnmd to set %k1, a {%k1}-masked vunpckhps into ymm2, and a vmovaps copy to ymm0.
   14014 define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
   14015 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask0:
   14016 ; GENERIC:       # %bb.0:
   14017 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   14018 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14019 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
   14020 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14021 ;
   14022 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mask0:
   14023 ; SKX:       # %bb.0:
   14024 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   14025 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14026 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
   14027 ; SKX-NEXT:    retq # sched: [7:1.00]
   14028   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14029   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14030   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   14031   ret <8 x float> %res
   14032 }
   14033 
   ; Zero-masked 8 x float high unpack: lanes where %mask equals zero take the
   ; <2,10,3,11,6,14,7,15> shuffle of %vec1 and %vec2; the other lanes are zero.
   ; The checks expect vptestnmd to set %k1 followed by a {%k1} {z} vunpckhps.
   14034 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
   14035 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
   14036 ; GENERIC:       # %bb.0:
   14037 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   14038 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14039 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14040 ;
   14041 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
   14042 ; SKX:       # %bb.0:
   14043 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   14044 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14045 ; SKX-NEXT:    retq # sched: [7:1.00]
   14046   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14047   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14048   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   14049   ret <8 x float> %res
   14050 }
   ; Merge-masked 8 x float high unpack (mask1 variant): lanes where %mask equals zero take the
   ; <2,10,3,11,6,14,7,15> shuffle of %vec1 and %vec2; the other lanes keep %vec3.
   ; The checks expect vptestnmd to set %k1, a {%k1}-masked vunpckhps into ymm2, and a vmovaps copy to ymm0.
   14051 define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
   14052 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask1:
   14053 ; GENERIC:       # %bb.0:
   14054 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   14055 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14056 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
   14057 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14058 ;
   14059 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mask1:
   14060 ; SKX:       # %bb.0:
   14061 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   14062 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14063 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
   14064 ; SKX-NEXT:    retq # sched: [7:1.00]
   14065   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14066   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14067   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   14068   ret <8 x float> %res
   14069 }
   14070 
   ; Zero-masked 8 x float high unpack (mask1 variant): lanes where %mask equals zero take the
   ; <2,10,3,11,6,14,7,15> shuffle of %vec1 and %vec2; the other lanes are zero.
   ; The checks expect vptestnmd to set %k1 followed by a {%k1} {z} vunpckhps.
   14071 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
   14072 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
   14073 ; GENERIC:       # %bb.0:
   14074 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   14075 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14076 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14077 ;
   14078 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
   14079 ; SKX:       # %bb.0:
   14080 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   14081 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14082 ; SKX-NEXT:    retq # sched: [7:1.00]
   14083   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14084   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14085   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   14086   ret <8 x float> %res
   14087 }
   ; Merge-masked 8 x float high unpack (mask2 variant): lanes where %mask equals zero take the
   ; <2,10,3,11,6,14,7,15> shuffle of %vec1 and %vec2; the other lanes keep %vec3.
   ; The checks expect vptestnmd to set %k1, a {%k1}-masked vunpckhps into ymm2, and a vmovaps copy to ymm0.
   14088 define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
   14089 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask2:
   14090 ; GENERIC:       # %bb.0:
   14091 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   14092 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14093 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
   14094 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14095 ;
   14096 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mask2:
   14097 ; SKX:       # %bb.0:
   14098 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   14099 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14100 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
   14101 ; SKX-NEXT:    retq # sched: [7:1.00]
   14102   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14103   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14104   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   14105   ret <8 x float> %res
   14106 }
   14107 
   ; Zero-masked 8 x float high unpack (mask2 variant): lanes where %mask equals zero take the
   ; <2,10,3,11,6,14,7,15> shuffle of %vec1 and %vec2; the other lanes are zero.
   ; The checks expect vptestnmd to set %k1 followed by a {%k1} {z} vunpckhps.
   14108 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
   14109 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
   14110 ; GENERIC:       # %bb.0:
   14111 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   14112 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14113 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14114 ;
   14115 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
   14116 ; SKX:       # %bb.0:
   14117 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   14118 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14119 ; SKX-NEXT:    retq # sched: [7:1.00]
   14120   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14121   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14122   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   14123   ret <8 x float> %res
   14124 }
   ; Unmasked 8 x float high unpack of two register operands (mask3 variant): shuffle indices
   ; <2,10,3,11,6,14,7,15> interleave elements 2, 3, 6 and 7 of %vec1 with the same elements of %vec2.
   ; Both prefixes expect a single ymm vunpckhps followed by retq.
   14125 define <8 x float> @test_8xfloat_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2) {
   14126 ; GENERIC-LABEL: test_8xfloat_unpack_high_mask3:
   14127 ; GENERIC:       # %bb.0:
   14128 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14129 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14130 ;
   14131 ; SKX-LABEL: test_8xfloat_unpack_high_mask3:
   14132 ; SKX:       # %bb.0:
   14133 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14134 ; SKX-NEXT:    retq # sched: [7:1.00]
   14135   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14136   ret <8 x float> %res
   14137 }
   ; Merge-masked 8 x float high unpack (mask3 variant): lanes where %mask equals zero take the
   ; <2,10,3,11,6,14,7,15> shuffle of %vec1 and %vec2; the other lanes keep %vec3.
   ; The checks expect vptestnmd to set %k1, a {%k1}-masked vunpckhps into ymm2, and a vmovaps copy to ymm0.
   14138 define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
   14139 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask3:
   14140 ; GENERIC:       # %bb.0:
   14141 ; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
   14142 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14143 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
   14144 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14145 ;
   14146 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mask3:
   14147 ; SKX:       # %bb.0:
   14148 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
   14149 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14150 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
   14151 ; SKX-NEXT:    retq # sched: [7:1.00]
   14152   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14153   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14154   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   14155   ret <8 x float> %res
   14156 }
   14157 
   ; Zero-masked 8 x float high unpack (mask3 variant): lanes where %mask equals zero take the
   ; <2,10,3,11,6,14,7,15> shuffle of %vec1 and %vec2; the other lanes are zero.
   ; The checks expect vptestnmd to set %k1 followed by a {%k1} {z} vunpckhps.
   14158 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
   14159 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
   14160 ; GENERIC:       # %bb.0:
   14161 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   14162 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14163 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14164 ;
   14165 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
   14166 ; SKX:       # %bb.0:
   14167 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   14168 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   14169 ; SKX-NEXT:    retq # sched: [7:1.00]
   14170   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14171   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14172   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   14173   ret <8 x float> %res
   14174 }
   ; Unmasked 8 x float high unpack with a memory operand: shuffle indices <2,10,3,11,6,14,7,15>
   ; interleave elements 2, 3, 6 and 7 of %vec1 with the same elements of the vector loaded from %vec2p.
   ; Both prefixes expect a single ymm vunpckhps that reads its second source from memory, then retq.
   14175 define <8 x float> @test_8xfloat_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
   14176 ; GENERIC-LABEL: test_8xfloat_unpack_high_mem_mask0:
   14177 ; GENERIC:       # %bb.0:
   14178 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14179 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14180 ;
   14181 ; SKX-LABEL: test_8xfloat_unpack_high_mem_mask0:
   14182 ; SKX:       # %bb.0:
   14183 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14184 ; SKX-NEXT:    retq # sched: [7:1.00]
   14185   %vec2 = load <8 x float>, <8 x float>* %vec2p
   14186   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14187   ret <8 x float> %res
   14188 }
   ; Merge-masked 8 x float high unpack with a memory operand: lanes where %mask equals zero take the
   ; <2,10,3,11,6,14,7,15> shuffle of %vec1 and the vector loaded from %vec2p; the other lanes keep %vec3.
   ; The checks expect vptestnmd to set %k1, a {%k1}-masked vunpckhps into ymm1, and a vmovaps copy to ymm0.
   14189 define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
   14190 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
   14191 ; GENERIC:       # %bb.0:
   14192 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   14193 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14194 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   14195 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14196 ;
   14197 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
   14198 ; SKX:       # %bb.0:
   14199 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   14200 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14201 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   14202 ; SKX-NEXT:    retq # sched: [7:1.00]
   14203   %vec2 = load <8 x float>, <8 x float>* %vec2p
   14204   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14205   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14206   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   14207   ret <8 x float> %res
   14208 }
   14209 
   ; Zero-masked 8 x float high unpack with a memory operand: lanes where %mask equals zero take the
   ; <2,10,3,11,6,14,7,15> shuffle of %vec1 and the vector loaded from %vec2p; the other lanes are zero.
   ; The checks expect vptestnmd to set %k1 followed by a {%k1} {z} vunpckhps.
   14210 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
   14211 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
   14212 ; GENERIC:       # %bb.0:
   14213 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   14214 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14215 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14216 ;
   14217 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
   14218 ; SKX:       # %bb.0:
   14219 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   14220 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14221 ; SKX-NEXT:    retq # sched: [7:1.00]
   14222   %vec2 = load <8 x float>, <8 x float>* %vec2p
   14223   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14224   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14225   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   14226   ret <8 x float> %res
   14227 }
   14228 
   ; Merge-masked 8 x float high unpack with a memory operand (mask1 variant): lanes where %mask equals zero
   ; take the <2,10,3,11,6,14,7,15> shuffle of %vec1 and the vector loaded from %vec2p; the other lanes keep %vec3.
   ; The checks expect vptestnmd to set %k1, a {%k1}-masked vunpckhps into ymm1, and a vmovaps copy to ymm0.
   14229 define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
   14230 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
   14231 ; GENERIC:       # %bb.0:
   14232 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   14233 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14234 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   14235 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14236 ;
   14237 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
   14238 ; SKX:       # %bb.0:
   14239 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   14240 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14241 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   14242 ; SKX-NEXT:    retq # sched: [7:1.00]
   14243   %vec2 = load <8 x float>, <8 x float>* %vec2p
   14244   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14245   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14246   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   14247   ret <8 x float> %res
   14248 }
   14249 
   ; Zero-masked 8 x float high unpack with a memory operand (mask1 variant): lanes where %mask equals zero
   ; take the <2,10,3,11,6,14,7,15> shuffle of %vec1 and the vector loaded from %vec2p; the other lanes are zero.
   ; The checks expect vptestnmd to set %k1 followed by a {%k1} {z} vunpckhps.
   14250 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
   14251 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
   14252 ; GENERIC:       # %bb.0:
   14253 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   14254 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14255 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14256 ;
   14257 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
   14258 ; SKX:       # %bb.0:
   14259 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   14260 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14261 ; SKX-NEXT:    retq # sched: [7:1.00]
   14262   %vec2 = load <8 x float>, <8 x float>* %vec2p
   14263   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14264   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14265   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   14266   ret <8 x float> %res
   14267 }
   14268 
   ; Merge-masked 8 x float high unpack with a memory operand (mask2 variant): lanes where %mask equals zero
   ; take the <2,10,3,11,6,14,7,15> shuffle of %vec1 and the vector loaded from %vec2p; the other lanes keep %vec3.
   ; The checks expect vptestnmd to set %k1, a {%k1}-masked vunpckhps into ymm1, and a vmovaps copy to ymm0.
   14269 define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
   14270 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
   14271 ; GENERIC:       # %bb.0:
   14272 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   14273 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14274 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   14275 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14276 ;
   14277 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
   14278 ; SKX:       # %bb.0:
   14279 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   14280 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14281 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   14282 ; SKX-NEXT:    retq # sched: [7:1.00]
   14283   %vec2 = load <8 x float>, <8 x float>* %vec2p
   14284   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14285   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14286   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   14287   ret <8 x float> %res
   14288 }
   14289 
   ; Zero-masked 8 x float high unpack with a memory operand (mask2 variant): lanes where %mask equals zero
   ; take the <2,10,3,11,6,14,7,15> shuffle of %vec1 and the vector loaded from %vec2p; the other lanes are zero.
   ; The checks expect vptestnmd to set %k1 followed by a {%k1} {z} vunpckhps.
   14290 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
   14291 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
   14292 ; GENERIC:       # %bb.0:
   14293 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   14294 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14295 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14296 ;
   14297 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
   14298 ; SKX:       # %bb.0:
   14299 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   14300 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14301 ; SKX-NEXT:    retq # sched: [7:1.00]
   14302   %vec2 = load <8 x float>, <8 x float>* %vec2p
   14303   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14304   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14305   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   14306   ret <8 x float> %res
   14307 }
   14308 
   ; Unmasked 8 x float high unpack with a memory operand (mask3 variant): shuffle indices <2,10,3,11,6,14,7,15>
   ; interleave elements 2, 3, 6 and 7 of %vec1 with the same elements of the vector loaded from %vec2p.
   ; Both prefixes expect a single ymm vunpckhps that reads its second source from memory, then retq.
   14309 define <8 x float> @test_8xfloat_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
   14310 ; GENERIC-LABEL: test_8xfloat_unpack_high_mem_mask3:
   14311 ; GENERIC:       # %bb.0:
   14312 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14313 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14314 ;
   14315 ; SKX-LABEL: test_8xfloat_unpack_high_mem_mask3:
   14316 ; SKX:       # %bb.0:
   14317 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14318 ; SKX-NEXT:    retq # sched: [7:1.00]
   14319   %vec2 = load <8 x float>, <8 x float>* %vec2p
   14320   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14321   ret <8 x float> %res
   14322 }
   ; Merge-masked 8 x float high unpack with a memory operand (mask3 variant): lanes where %mask equals zero
   ; take the <2,10,3,11,6,14,7,15> shuffle of %vec1 and the vector loaded from %vec2p; the other lanes keep %vec3.
   ; The checks expect vptestnmd to set %k1, a {%k1}-masked vunpckhps into ymm1, and a vmovaps copy to ymm0.
   14323 define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
   14324 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
   14325 ; GENERIC:       # %bb.0:
   14326 ; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
   14327 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14328 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
   14329 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14330 ;
   14331 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
   14332 ; SKX:       # %bb.0:
   14333 ; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
   14334 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14335 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
   14336 ; SKX-NEXT:    retq # sched: [7:1.00]
   14337   %vec2 = load <8 x float>, <8 x float>* %vec2p
   14338   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14339   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14340   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   14341   ret <8 x float> %res
   14342 }
   14343 
   ; Zero-masked 8 x float high unpack with a memory operand (mask3 variant): lanes where %mask equals zero
   ; take the <2,10,3,11,6,14,7,15> shuffle of %vec1 and the vector loaded from %vec2p; the other lanes are zero.
   ; The checks expect vptestnmd to set %k1 followed by a {%k1} {z} vunpckhps.
   14344 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
   14345 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
   14346 ; GENERIC:       # %bb.0:
   14347 ; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
   14348 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14349 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14350 ;
   14351 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
   14352 ; SKX:       # %bb.0:
   14353 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
   14354 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   14355 ; SKX-NEXT:    retq # sched: [7:1.00]
   14356   %vec2 = load <8 x float>, <8 x float>* %vec2p
   14357   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   14358   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   14359   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   14360   ret <8 x float> %res
   14361 }
   14362 
   ; Unmasked 16 x float high unpack of two register operands: the shuffle interleaves elements
   ; 2, 3, 6, 7, 10, 11, 14 and 15 of %vec1 with the same elements of %vec2 (indices 18, 19, ... 31).
   ; Both prefixes expect a single zmm vunpckhps followed by retq.
   14363 define <16 x float> @test_16xfloat_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2) {
   14364 ; GENERIC-LABEL: test_16xfloat_unpack_high_mask0:
   14365 ; GENERIC:       # %bb.0:
   14366 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14367 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14368 ;
   14369 ; SKX-LABEL: test_16xfloat_unpack_high_mask0:
   14370 ; SKX:       # %bb.0:
   14371 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14372 ; SKX-NEXT:    retq # sched: [7:1.00]
   14373   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14374   ret <16 x float> %res
   14375 }
   ; Merge-masked 16 x float high unpack: lanes where %mask equals zero take the shuffle that interleaves
   ; elements 2, 3, 6, 7, 10, 11, 14 and 15 of %vec1 and %vec2; the other lanes keep %vec3.
   ; The checks expect vptestnmd to set %k1, a {%k1}-masked vunpckhps into zmm2, and a vmovaps copy to zmm0.
   14376 define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
   14377 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask0:
   14378 ; GENERIC:       # %bb.0:
   14379 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   14380 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14381 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
   14382 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14383 ;
   14384 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask0:
   14385 ; SKX:       # %bb.0:
   14386 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   14387 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14388 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
   14389 ; SKX-NEXT:    retq # sched: [7:1.00]
   14390   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14391   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14392   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   14393   ret <16 x float> %res
   14394 }
   14395 
   ; Zero-masked 16 x float high unpack: lanes where %mask equals zero take the shuffle that interleaves
   ; elements 2, 3, 6, 7, 10, 11, 14 and 15 of %vec1 and %vec2; the other lanes are zero.
   ; The checks expect vptestnmd to set %k1 followed by a {%k1} {z} vunpckhps.
   14396 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
   14397 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
   14398 ; GENERIC:       # %bb.0:
   14399 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   14400 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14401 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14402 ;
   14403 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
   14404 ; SKX:       # %bb.0:
   14405 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   14406 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14407 ; SKX-NEXT:    retq # sched: [7:1.00]
   14408   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14409   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14410   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   14411   ret <16 x float> %res
   14412 }
   ; Merge-masked 16 x float high unpack (mask1 variant): lanes where %mask equals zero take the shuffle that
   ; interleaves elements 2, 3, 6, 7, 10, 11, 14 and 15 of %vec1 and %vec2; the other lanes keep %vec3.
   ; The checks expect vptestnmd to set %k1, a {%k1}-masked vunpckhps into zmm2, and a vmovaps copy to zmm0.
   14413 define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
   14414 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask1:
   14415 ; GENERIC:       # %bb.0:
   14416 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   14417 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14418 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
   14419 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14420 ;
   14421 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask1:
   14422 ; SKX:       # %bb.0:
   14423 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   14424 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14425 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
   14426 ; SKX-NEXT:    retq # sched: [7:1.00]
   14427   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14428   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14429   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   14430   ret <16 x float> %res
   14431 }
   14432 
   ; Zero-masked 16 x float high unpack (mask1 variant): lanes where %mask equals zero take the shuffle that
   ; interleaves elements 2, 3, 6, 7, 10, 11, 14 and 15 of %vec1 and %vec2; the other lanes are zero.
   ; The checks expect vptestnmd to set %k1 followed by a {%k1} {z} vunpckhps.
   14433 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
   14434 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
   14435 ; GENERIC:       # %bb.0:
   14436 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   14437 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14438 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14439 ;
   14440 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
   14441 ; SKX:       # %bb.0:
   14442 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   14443 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14444 ; SKX-NEXT:    retq # sched: [7:1.00]
   14445   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14446   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14447   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   14448   ret <16 x float> %res
   14449 }
   14450 define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
         ; Merge-masked register form: a result lane holds the interleave of
         ; elements 2,3 of each 4-float group of %vec1 and %vec2 when the matching
         ; %mask element is zero, and the %vec3 lane otherwise. The checks expect
         ; vptestnmd to produce %k1, a {%k1} vunpckhps that writes zmm2, and a
         ; vmovaps copying zmm2 into zmm0.
   14451 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask2:
   14452 ; GENERIC:       # %bb.0:
   14453 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   14454 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14455 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
   14456 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14457 ;
   14458 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask2:
   14459 ; SKX:       # %bb.0:
   14460 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   14461 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14462 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
   14463 ; SKX-NEXT:    retq # sched: [7:1.00]
   14464   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14465   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14466   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   14467   ret <16 x float> %res
   14468 }
   14469 
   14470 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
         ; Zero-masked register form: a result lane holds the interleave of
         ; elements 2,3 of each 4-float group of %vec1 and %vec2 when the matching
         ; %mask element is zero, and zero otherwise. The checks expect vptestnmd
         ; to produce %k1, followed by a single {%k1} {z} vunpckhps on zmm0.
   14471 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
   14472 ; GENERIC:       # %bb.0:
   14473 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   14474 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14475 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14476 ;
   14477 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
   14478 ; SKX:       # %bb.0:
   14479 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   14480 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14481 ; SKX-NEXT:    retq # sched: [7:1.00]
   14482   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14483   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14484   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   14485   ret <16 x float> %res
   14486 }
   14487 define <16 x float> @test_16xfloat_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2) {
         ; Unmasked register form: interleaves elements 2,3 of each 4-float group
         ; of %vec1 with the same elements of %vec2 (shuffle indices >= 16 select
         ; from %vec2). The checks expect a single zmm vunpckhps.
   14488 ; GENERIC-LABEL: test_16xfloat_unpack_high_mask3:
   14489 ; GENERIC:       # %bb.0:
   14490 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14491 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14492 ;
   14493 ; SKX-LABEL: test_16xfloat_unpack_high_mask3:
   14494 ; SKX:       # %bb.0:
   14495 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14496 ; SKX-NEXT:    retq # sched: [7:1.00]
   14497   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14498   ret <16 x float> %res
   14499 }
   14500 define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
         ; Merge-masked register form: a result lane holds the interleave of
         ; elements 2,3 of each 4-float group of %vec1 and %vec2 when the matching
         ; %mask element is zero, and the %vec3 lane otherwise. The checks expect
         ; vptestnmd to produce %k1, a {%k1} vunpckhps that writes zmm2, and a
         ; vmovaps copying zmm2 into zmm0.
   14501 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask3:
   14502 ; GENERIC:       # %bb.0:
   14503 ; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
   14504 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14505 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
   14506 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14507 ;
   14508 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask3:
   14509 ; SKX:       # %bb.0:
   14510 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
   14511 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14512 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
   14513 ; SKX-NEXT:    retq # sched: [7:1.00]
   14514   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14515   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14516   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   14517   ret <16 x float> %res
   14518 }
   14519 
   14520 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
         ; Zero-masked register form: a result lane holds the interleave of
         ; elements 2,3 of each 4-float group of %vec1 and %vec2 when the matching
         ; %mask element is zero, and zero otherwise. The checks expect vptestnmd
         ; to produce %k1, followed by a single {%k1} {z} vunpckhps on zmm0.
   14521 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
   14522 ; GENERIC:       # %bb.0:
   14523 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   14524 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14525 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14526 ;
   14527 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
   14528 ; SKX:       # %bb.0:
   14529 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   14530 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
   14531 ; SKX-NEXT:    retq # sched: [7:1.00]
   14532   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14533   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14534   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   14535   ret <16 x float> %res
   14536 }
   14537 define <16 x float> @test_16xfloat_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
         ; Unmasked memory form: %vec2 is loaded from %vec2p and its elements 2,3
         ; of each 4-float group are interleaved with the same elements of %vec1.
         ; The checks expect the load to appear as the `mem` operand of a single
         ; zmm vunpckhps, with no separate load instruction.
   14538 ; GENERIC-LABEL: test_16xfloat_unpack_high_mem_mask0:
   14539 ; GENERIC:       # %bb.0:
   14540 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14541 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14542 ;
   14543 ; SKX-LABEL: test_16xfloat_unpack_high_mem_mask0:
   14544 ; SKX:       # %bb.0:
   14545 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14546 ; SKX-NEXT:    retq # sched: [7:1.00]
   14547   %vec2 = load <16 x float>, <16 x float>* %vec2p
   14548   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14549   ret <16 x float> %res
   14550 }
   14551 define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
         ; Merge-masked memory form: %vec2 is loaded from %vec2p; a result lane
         ; holds the interleave of elements 2,3 of each 4-float group of %vec1 and
         ; %vec2 when the matching %mask element is zero, and the %vec3 lane
         ; otherwise. The checks expect vptestnmd to produce %k1, a {%k1}
         ; vunpckhps with a `mem` operand that writes zmm1, and a vmovaps copying
         ; zmm1 into zmm0.
   14552 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
   14553 ; GENERIC:       # %bb.0:
   14554 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   14555 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14556 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   14557 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14558 ;
   14559 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
   14560 ; SKX:       # %bb.0:
   14561 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   14562 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14563 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   14564 ; SKX-NEXT:    retq # sched: [7:1.00]
   14565   %vec2 = load <16 x float>, <16 x float>* %vec2p
   14566   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14567   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14568   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   14569   ret <16 x float> %res
   14570 }
   14571 
   14572 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
         ; Zero-masked memory form: %vec2 is loaded from %vec2p; a result lane
         ; holds the interleave of elements 2,3 of each 4-float group of %vec1 and
         ; %vec2 when the matching %mask element is zero, and zero otherwise. The
         ; checks expect vptestnmd to produce %k1, followed by a single
         ; {%k1} {z} vunpckhps with a `mem` operand.
   14573 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
   14574 ; GENERIC:       # %bb.0:
   14575 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   14576 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14577 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14578 ;
   14579 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
   14580 ; SKX:       # %bb.0:
   14581 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   14582 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14583 ; SKX-NEXT:    retq # sched: [7:1.00]
   14584   %vec2 = load <16 x float>, <16 x float>* %vec2p
   14585   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14586   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14587   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   14588   ret <16 x float> %res
   14589 }
   14590 
   14591 define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
         ; Merge-masked memory form: %vec2 is loaded from %vec2p; a result lane
         ; holds the interleave of elements 2,3 of each 4-float group of %vec1 and
         ; %vec2 when the matching %mask element is zero, and the %vec3 lane
         ; otherwise. The checks expect vptestnmd to produce %k1, a {%k1}
         ; vunpckhps with a `mem` operand that writes zmm1, and a vmovaps copying
         ; zmm1 into zmm0.
   14592 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
   14593 ; GENERIC:       # %bb.0:
   14594 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   14595 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14596 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   14597 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14598 ;
   14599 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
   14600 ; SKX:       # %bb.0:
   14601 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   14602 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14603 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   14604 ; SKX-NEXT:    retq # sched: [7:1.00]
   14605   %vec2 = load <16 x float>, <16 x float>* %vec2p
   14606   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14607   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14608   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   14609   ret <16 x float> %res
   14610 }
   14611 
   14612 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
         ; Zero-masked memory form: %vec2 is loaded from %vec2p; a result lane
         ; holds the interleave of elements 2,3 of each 4-float group of %vec1 and
         ; %vec2 when the matching %mask element is zero, and zero otherwise. The
         ; checks expect vptestnmd to produce %k1, followed by a single
         ; {%k1} {z} vunpckhps with a `mem` operand.
   14613 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
   14614 ; GENERIC:       # %bb.0:
   14615 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   14616 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14617 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14618 ;
   14619 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
   14620 ; SKX:       # %bb.0:
   14621 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   14622 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14623 ; SKX-NEXT:    retq # sched: [7:1.00]
   14624   %vec2 = load <16 x float>, <16 x float>* %vec2p
   14625   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14626   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14627   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   14628   ret <16 x float> %res
   14629 }
   14630 
   14631 define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
         ; Merge-masked memory form: %vec2 is loaded from %vec2p; a result lane
         ; holds the interleave of elements 2,3 of each 4-float group of %vec1 and
         ; %vec2 when the matching %mask element is zero, and the %vec3 lane
         ; otherwise. The checks expect vptestnmd to produce %k1, a {%k1}
         ; vunpckhps with a `mem` operand that writes zmm1, and a vmovaps copying
         ; zmm1 into zmm0.
   14632 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
   14633 ; GENERIC:       # %bb.0:
   14634 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   14635 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14636 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   14637 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14638 ;
   14639 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
   14640 ; SKX:       # %bb.0:
   14641 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   14642 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14643 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   14644 ; SKX-NEXT:    retq # sched: [7:1.00]
   14645   %vec2 = load <16 x float>, <16 x float>* %vec2p
   14646   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14647   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14648   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   14649   ret <16 x float> %res
   14650 }
   14651 
   14652 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
         ; Zero-masked memory form: %vec2 is loaded from %vec2p; a result lane
         ; holds the interleave of elements 2,3 of each 4-float group of %vec1 and
         ; %vec2 when the matching %mask element is zero, and zero otherwise. The
         ; checks expect vptestnmd to produce %k1, followed by a single
         ; {%k1} {z} vunpckhps with a `mem` operand.
   14653 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
   14654 ; GENERIC:       # %bb.0:
   14655 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   14656 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14657 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14658 ;
   14659 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
   14660 ; SKX:       # %bb.0:
   14661 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   14662 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14663 ; SKX-NEXT:    retq # sched: [7:1.00]
   14664   %vec2 = load <16 x float>, <16 x float>* %vec2p
   14665   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14666   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14667   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   14668   ret <16 x float> %res
   14669 }
   14670 
   14671 define <16 x float> @test_16xfloat_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
         ; Unmasked memory form: %vec2 is loaded from %vec2p and its elements 2,3
         ; of each 4-float group are interleaved with the same elements of %vec1.
         ; The checks expect the load to appear as the `mem` operand of a single
         ; zmm vunpckhps, with no separate load instruction.
   14672 ; GENERIC-LABEL: test_16xfloat_unpack_high_mem_mask3:
   14673 ; GENERIC:       # %bb.0:
   14674 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14675 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14676 ;
   14677 ; SKX-LABEL: test_16xfloat_unpack_high_mem_mask3:
   14678 ; SKX:       # %bb.0:
   14679 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14680 ; SKX-NEXT:    retq # sched: [7:1.00]
   14681   %vec2 = load <16 x float>, <16 x float>* %vec2p
   14682   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14683   ret <16 x float> %res
   14684 }
   14685 define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
         ; Merge-masked memory form: %vec2 is loaded from %vec2p; a result lane
         ; holds the interleave of elements 2,3 of each 4-float group of %vec1 and
         ; %vec2 when the matching %mask element is zero, and the %vec3 lane
         ; otherwise. The checks expect vptestnmd to produce %k1, a {%k1}
         ; vunpckhps with a `mem` operand that writes zmm1, and a vmovaps copying
         ; zmm1 into zmm0.
   14686 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
   14687 ; GENERIC:       # %bb.0:
   14688 ; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
   14689 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14690 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
   14691 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14692 ;
   14693 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
   14694 ; SKX:       # %bb.0:
   14695 ; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
   14696 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14697 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
   14698 ; SKX-NEXT:    retq # sched: [7:1.00]
   14699   %vec2 = load <16 x float>, <16 x float>* %vec2p
   14700   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14701   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14702   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   14703   ret <16 x float> %res
   14704 }
   14705 
   14706 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
         ; Zero-masked memory form: %vec2 is loaded from %vec2p; a result lane
         ; holds the interleave of elements 2,3 of each 4-float group of %vec1 and
         ; %vec2 when the matching %mask element is zero, and zero otherwise. The
         ; checks expect vptestnmd to produce %k1, followed by a single
         ; {%k1} {z} vunpckhps with a `mem` operand.
   14707 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
   14708 ; GENERIC:       # %bb.0:
   14709 ; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
   14710 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14711 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14712 ;
   14713 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
   14714 ; SKX:       # %bb.0:
   14715 ; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
   14716 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
   14717 ; SKX-NEXT:    retq # sched: [7:1.00]
   14718   %vec2 = load <16 x float>, <16 x float>* %vec2p
   14719   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   14720   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   14721   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   14722   ret <16 x float> %res
   14723 }
   14724 
   14725 define <2 x double> @test_2xdouble_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2) {
         ; Unmasked register form: the result is element 1 of %vec1 followed by
         ; element 1 of %vec2 (shuffle index 3 selects %vec2[1]). The checks
         ; expect a single xmm vunpckhpd.
   14726 ; GENERIC-LABEL: test_2xdouble_unpack_high_mask0:
   14727 ; GENERIC:       # %bb.0:
   14728 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
   14729 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14730 ;
   14731 ; SKX-LABEL: test_2xdouble_unpack_high_mask0:
   14732 ; SKX:       # %bb.0:
   14733 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
   14734 ; SKX-NEXT:    retq # sched: [7:1.00]
   14735   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   14736   ret <2 x double> %res
   14737 }
   14738 define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
         ; Merge-masked register form: a result lane comes from the shuffle
         ; <%vec1[1], %vec2[1]> when the matching %mask element is zero, and from
         ; %vec3 otherwise. The checks expect vptestnmq to produce %k1, a {%k1}
         ; vunpckhpd that writes xmm2, and a vmovapd copying xmm2 into xmm0.
   14739 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask0:
   14740 ; GENERIC:       # %bb.0:
   14741 ; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
   14742 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
   14743 ; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
   14744 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14745 ;
   14746 ; SKX-LABEL: test_2xdouble_masked_unpack_high_mask0:
   14747 ; SKX:       # %bb.0:
   14748 ; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
   14749 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
   14750 ; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
   14751 ; SKX-NEXT:    retq # sched: [7:1.00]
   14752   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   14753   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   14754   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
   14755   ret <2 x double> %res
   14756 }
   14757 
   14758 define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
         ; Zero-masked register form: a result lane comes from the shuffle
         ; <%vec1[1], %vec2[1]> when the matching %mask element is zero, and is
         ; zero otherwise. The checks expect vptestnmq to produce %k1, followed
         ; by a single {%k1} {z} vunpckhpd on xmm0.
   14759 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
   14760 ; GENERIC:       # %bb.0:
   14761 ; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
   14762 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
   14763 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14764 ;
   14765 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
   14766 ; SKX:       # %bb.0:
   14767 ; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
   14768 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
   14769 ; SKX-NEXT:    retq # sched: [7:1.00]
   14770   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   14771   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   14772   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   14773   ret <2 x double> %res
   14774 }
   14775 define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
         ; Merge-masked register form: a result lane comes from the shuffle
         ; <%vec1[1], %vec2[1]> when the matching %mask element is zero, and from
         ; %vec3 otherwise. The checks expect vptestnmq to produce %k1, a {%k1}
         ; vunpckhpd that writes xmm2, and a vmovapd copying xmm2 into xmm0.
   14776 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask1:
   14777 ; GENERIC:       # %bb.0:
   14778 ; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
   14779 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
   14780 ; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
   14781 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14782 ;
   14783 ; SKX-LABEL: test_2xdouble_masked_unpack_high_mask1:
   14784 ; SKX:       # %bb.0:
   14785 ; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
   14786 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
   14787 ; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
   14788 ; SKX-NEXT:    retq # sched: [7:1.00]
   14789   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   14790   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   14791   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
   14792   ret <2 x double> %res
   14793 }
   14794 
   14795 define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
         ; Zero-masked register form: a result lane comes from the shuffle
         ; <%vec1[1], %vec2[1]> when the matching %mask element is zero, and is
         ; zero otherwise. The checks expect vptestnmq to produce %k1, followed
         ; by a single {%k1} {z} vunpckhpd on xmm0.
   14796 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
   14797 ; GENERIC:       # %bb.0:
   14798 ; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
   14799 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
   14800 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14801 ;
   14802 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
   14803 ; SKX:       # %bb.0:
   14804 ; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
   14805 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
   14806 ; SKX-NEXT:    retq # sched: [7:1.00]
   14807   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   14808   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   14809   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   14810   ret <2 x double> %res
   14811 }
   14812 define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
         ; Unmasked memory form: %vec2 is loaded from %vec2p and the result is
         ; element 1 of %vec1 followed by element 1 of %vec2. The checks expect
         ; the load to appear as the `mem` operand of a single xmm vunpckhpd,
         ; with no separate load instruction.
   14813 ; GENERIC-LABEL: test_2xdouble_unpack_high_mem_mask0:
   14814 ; GENERIC:       # %bb.0:
   14815 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [7:1.00]
   14816 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14817 ;
   14818 ; SKX-LABEL: test_2xdouble_unpack_high_mem_mask0:
   14819 ; SKX:       # %bb.0:
   14820 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [7:1.00]
   14821 ; SKX-NEXT:    retq # sched: [7:1.00]
   14822   %vec2 = load <2 x double>, <2 x double>* %vec2p
   14823   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   14824   ret <2 x double> %res
   14825 }
   14826 define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
         ; Merge-masked memory form: %vec2 is loaded from %vec2p; a result lane
         ; comes from the shuffle <%vec1[1], %vec2[1]> when the matching %mask
         ; element is zero, and from %vec3 otherwise. The checks expect vptestnmq
         ; to produce %k1, a {%k1} vunpckhpd with a `mem` operand that writes
         ; xmm1, and a vmovapd copying xmm1 into xmm0.
   14827 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
   14828 ; GENERIC:       # %bb.0:
   14829 ; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
   14830 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
   14831 ; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
   14832 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14833 ;
   14834 ; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
   14835 ; SKX:       # %bb.0:
   14836 ; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
   14837 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
   14838 ; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
   14839 ; SKX-NEXT:    retq # sched: [7:1.00]
   14840   %vec2 = load <2 x double>, <2 x double>* %vec2p
   14841   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   14842   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   14843   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
   14844   ret <2 x double> %res
   14845 }
   14846 
   14847 define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
         ; Zero-masked memory form: %vec2 is loaded from %vec2p; a result lane
         ; comes from the shuffle <%vec1[1], %vec2[1]> when the matching %mask
         ; element is zero, and is zero otherwise. The checks expect vptestnmq to
         ; produce %k1, followed by a single {%k1} {z} vunpckhpd with a `mem`
         ; operand.
   14848 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
   14849 ; GENERIC:       # %bb.0:
   14850 ; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
   14851 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
   14852 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14853 ;
   14854 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
   14855 ; SKX:       # %bb.0:
   14856 ; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
   14857 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
   14858 ; SKX-NEXT:    retq # sched: [7:1.00]
   14859   %vec2 = load <2 x double>, <2 x double>* %vec2p
   14860   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   14861   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   14862   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   14863   ret <2 x double> %res
   14864 }
   14865 
   14866 define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
         ; Merge-masked memory form: %vec2 is loaded from %vec2p; a result lane
         ; comes from the shuffle <%vec1[1], %vec2[1]> when the matching %mask
         ; element is zero, and from %vec3 otherwise. The checks expect vptestnmq
         ; to produce %k1, a {%k1} vunpckhpd with a `mem` operand that writes
         ; xmm1, and a vmovapd copying xmm1 into xmm0.
   14867 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
   14868 ; GENERIC:       # %bb.0:
   14869 ; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
   14870 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
   14871 ; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
   14872 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14873 ;
   14874 ; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
   14875 ; SKX:       # %bb.0:
   14876 ; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
   14877 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
   14878 ; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
   14879 ; SKX-NEXT:    retq # sched: [7:1.00]
   14880   %vec2 = load <2 x double>, <2 x double>* %vec2p
   14881   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   14882   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   14883   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
   14884   ret <2 x double> %res
   14885 }
   14886 
   14887 define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
         ; Zero-masked memory form: %vec2 is loaded from %vec2p; a result lane
         ; comes from the shuffle <%vec1[1], %vec2[1]> when the matching %mask
         ; element is zero, and is zero otherwise. The checks expect vptestnmq to
         ; produce %k1, followed by a single {%k1} {z} vunpckhpd with a `mem`
         ; operand.
   14888 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
   14889 ; GENERIC:       # %bb.0:
   14890 ; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
   14891 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
   14892 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14893 ;
   14894 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
   14895 ; SKX:       # %bb.0:
   14896 ; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
   14897 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
   14898 ; SKX-NEXT:    retq # sched: [7:1.00]
   14899   %vec2 = load <2 x double>, <2 x double>* %vec2p
   14900   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   14901   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
   14902   %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   14903   ret <2 x double> %res
   14904 }
   14905 
   14906 define <4 x double> @test_4xdouble_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2) {
   14907 ; GENERIC-LABEL: test_4xdouble_unpack_high_mask0:
   14908 ; GENERIC:       # %bb.0:
   14909 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   14910 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14911 ;
   14912 ; SKX-LABEL: test_4xdouble_unpack_high_mask0:
   14913 ; SKX:       # %bb.0:
   14914 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   14915 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Plain high unpack: interleave the odd-indexed doubles of %vec1 and %vec2.
   14916   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   14917   ret <4 x double> %unpacked
   14918 }
   14919 define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
   14920 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask0:
   14921 ; GENERIC:       # %bb.0:
   14922 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   14923 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   14924 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
   14925 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14926 ;
   14927 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mask0:
   14928 ; SKX:       # %bb.0:
   14929 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   14930 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   14931 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
   14932 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise the %vec3 lane.
   14933   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   14934   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   14935   %merged = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> %vec3
   14936   ret <4 x double> %merged
   14937 }
   14938 
   14939 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
   14940 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
   14941 ; GENERIC:       # %bb.0:
   14942 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   14943 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   14944 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14945 ;
   14946 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
   14947 ; SKX:       # %bb.0:
   14948 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   14949 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   14950 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise zero.
   14951   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   14952   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   14953   %zeroed = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> zeroinitializer
   14954   ret <4 x double> %zeroed
   14955 }
   14956 define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
   14957 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask1:
   14958 ; GENERIC:       # %bb.0:
   14959 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   14960 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   14961 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
   14962 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14963 ;
   14964 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mask1:
   14965 ; SKX:       # %bb.0:
   14966 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   14967 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   14968 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
   14969 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise the %vec3 lane.
   14970   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   14971   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   14972   %merged = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> %vec3
   14973   ret <4 x double> %merged
   14974 }
   14975 
   14976 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
   14977 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
   14978 ; GENERIC:       # %bb.0:
   14979 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   14980 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   14981 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   14982 ;
   14983 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
   14984 ; SKX:       # %bb.0:
   14985 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   14986 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   14987 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise zero.
   14988   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   14989   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   14990   %zeroed = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> zeroinitializer
   14991   ret <4 x double> %zeroed
   14992 }
   14993 define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
   14994 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask2:
   14995 ; GENERIC:       # %bb.0:
   14996 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   14997 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   14998 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
   14999 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15000 ;
   15001 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mask2:
   15002 ; SKX:       # %bb.0:
   15003 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   15004 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   15005 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
   15006 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise the %vec3 lane.
   15007   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15008   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   15009   %merged = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> %vec3
   15010   ret <4 x double> %merged
   15011 }
   15012 
   15013 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
   15014 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
   15015 ; GENERIC:       # %bb.0:
   15016 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   15017 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   15018 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15019 ;
   15020 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
   15021 ; SKX:       # %bb.0:
   15022 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   15023 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   15024 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise zero.
   15025   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15026   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   15027   %zeroed = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> zeroinitializer
   15028   ret <4 x double> %zeroed
   15029 }
   15030 define <4 x double> @test_4xdouble_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2) {
   15031 ; GENERIC-LABEL: test_4xdouble_unpack_high_mask3:
   15032 ; GENERIC:       # %bb.0:
   15033 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   15034 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15035 ;
   15036 ; SKX-LABEL: test_4xdouble_unpack_high_mask3:
   15037 ; SKX:       # %bb.0:
   15038 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   15039 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Plain high unpack: interleave the odd-indexed doubles of %vec1 and %vec2.
   15040   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15041   ret <4 x double> %unpacked
   15042 }
   15043 define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
   15044 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask3:
   15045 ; GENERIC:       # %bb.0:
   15046 ; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
   15047 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   15048 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
   15049 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15050 ;
   15051 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mask3:
   15052 ; SKX:       # %bb.0:
   15053 ; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
   15054 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   15055 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
   15056 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise the %vec3 lane.
   15057   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15058   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   15059   %merged = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> %vec3
   15060   ret <4 x double> %merged
   15061 }
   15062 
   15063 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
   15064 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
   15065 ; GENERIC:       # %bb.0:
   15066 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   15067 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   15068 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15069 ;
   15070 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
   15071 ; SKX:       # %bb.0:
   15072 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   15073 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   15074 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise zero.
   15075   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15076   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   15077   %zeroed = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> zeroinitializer
   15078   ret <4 x double> %zeroed
   15079 }
   15080 define <4 x double> @test_4xdouble_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
   15081 ; GENERIC-LABEL: test_4xdouble_unpack_high_mem_mask0:
   15082 ; GENERIC:       # %bb.0:
   15083 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15084 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15085 ;
   15086 ; SKX-LABEL: test_4xdouble_unpack_high_mem_mask0:
   15087 ; SKX:       # %bb.0:
   15088 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15089 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Plain high unpack with the second source loaded from %vec2p; the expected
         ; assembly uses the load as a memory operand of vunpckhpd.
   15090   %rhs = load <4 x double>, <4 x double>* %vec2p
   15091   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15092   ret <4 x double> %unpacked
   15093 }
   15094 define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
   15095 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
   15096 ; GENERIC:       # %bb.0:
   15097 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   15098 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15099 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   15100 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15101 ;
   15102 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
   15103 ; SKX:       # %bb.0:
   15104 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   15105 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15106 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   15107 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked high unpack with the second source loaded from %vec2p: where a
         ; %mask lane is zero the result takes the unpacked lane, otherwise the %vec3 lane.
   15108   %rhs = load <4 x double>, <4 x double>* %vec2p
   15109   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15110   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   15111   %merged = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> %vec3
   15112   ret <4 x double> %merged
   15113 }
   15114 
   15115 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
   15116 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
   15117 ; GENERIC:       # %bb.0:
   15118 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   15119 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15120 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15121 ;
   15122 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
   15123 ; SKX:       # %bb.0:
   15124 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   15125 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15126 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked high unpack with the second source loaded from %vec2p: where a
         ; %mask lane is zero the result takes the unpacked lane, otherwise zero.
   15127   %rhs = load <4 x double>, <4 x double>* %vec2p
   15128   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15129   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   15130   %zeroed = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> zeroinitializer
   15131   ret <4 x double> %zeroed
   15132 }
   15133 
   15134 define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
   15135 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
   15136 ; GENERIC:       # %bb.0:
   15137 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   15138 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15139 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   15140 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15141 ;
   15142 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
   15143 ; SKX:       # %bb.0:
   15144 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   15145 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15146 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   15147 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked high unpack with the second source loaded from %vec2p: where a
         ; %mask lane is zero the result takes the unpacked lane, otherwise the %vec3 lane.
   15148   %rhs = load <4 x double>, <4 x double>* %vec2p
   15149   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15150   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   15151   %merged = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> %vec3
   15152   ret <4 x double> %merged
   15153 }
   15154 
   15155 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
   15156 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
   15157 ; GENERIC:       # %bb.0:
   15158 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   15159 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15160 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15161 ;
   15162 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
   15163 ; SKX:       # %bb.0:
   15164 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   15165 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15166 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked high unpack with the second source loaded from %vec2p: where a
         ; %mask lane is zero the result takes the unpacked lane, otherwise zero.
   15167   %rhs = load <4 x double>, <4 x double>* %vec2p
   15168   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15169   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   15170   %zeroed = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> zeroinitializer
   15171   ret <4 x double> %zeroed
   15172 }
   15173 
   15174 define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
   15175 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
   15176 ; GENERIC:       # %bb.0:
   15177 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   15178 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15179 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   15180 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15181 ;
   15182 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
   15183 ; SKX:       # %bb.0:
   15184 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   15185 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15186 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   15187 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked high unpack with the second source loaded from %vec2p: where a
         ; %mask lane is zero the result takes the unpacked lane, otherwise the %vec3 lane.
   15188   %rhs = load <4 x double>, <4 x double>* %vec2p
   15189   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15190   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   15191   %merged = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> %vec3
   15192   ret <4 x double> %merged
   15193 }
   15194 
   15195 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
   15196 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
   15197 ; GENERIC:       # %bb.0:
   15198 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   15199 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15200 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15201 ;
   15202 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
   15203 ; SKX:       # %bb.0:
   15204 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   15205 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15206 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked high unpack with the second source loaded from %vec2p: where a
         ; %mask lane is zero the result takes the unpacked lane, otherwise zero.
   15207   %rhs = load <4 x double>, <4 x double>* %vec2p
   15208   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15209   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   15210   %zeroed = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> zeroinitializer
   15211   ret <4 x double> %zeroed
   15212 }
   15213 
   15214 define <4 x double> @test_4xdouble_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
   15215 ; GENERIC-LABEL: test_4xdouble_unpack_high_mem_mask3:
   15216 ; GENERIC:       # %bb.0:
   15217 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15218 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15219 ;
   15220 ; SKX-LABEL: test_4xdouble_unpack_high_mem_mask3:
   15221 ; SKX:       # %bb.0:
   15222 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15223 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Plain high unpack with the second source loaded from %vec2p; the expected
         ; assembly uses the load as a memory operand of vunpckhpd.
   15224   %rhs = load <4 x double>, <4 x double>* %vec2p
   15225   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15226   ret <4 x double> %unpacked
   15227 }
   15228 define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
   15229 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
   15230 ; GENERIC:       # %bb.0:
   15231 ; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
   15232 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15233 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
   15234 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15235 ;
   15236 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
   15237 ; SKX:       # %bb.0:
   15238 ; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
   15239 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15240 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
   15241 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked high unpack with the second source loaded from %vec2p: where a
         ; %mask lane is zero the result takes the unpacked lane, otherwise the %vec3 lane.
   15242   %rhs = load <4 x double>, <4 x double>* %vec2p
   15243   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15244   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   15245   %merged = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> %vec3
   15246   ret <4 x double> %merged
   15247 }
   15248 
   15249 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
   15250 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
   15251 ; GENERIC:       # %bb.0:
   15252 ; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
   15253 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15254 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15255 ;
   15256 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
   15257 ; SKX:       # %bb.0:
   15258 ; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
   15259 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   15260 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked high unpack with the second source loaded from %vec2p: where a
         ; %mask lane is zero the result takes the unpacked lane, otherwise zero.
   15261   %rhs = load <4 x double>, <4 x double>* %vec2p
   15262   %unpacked = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   15263   %mask.is.zero = icmp eq <4 x i64> %mask, zeroinitializer
   15264   %zeroed = select <4 x i1> %mask.is.zero, <4 x double> %unpacked, <4 x double> zeroinitializer
   15265   ret <4 x double> %zeroed
   15266 }
   15267 
   15268 define <8 x double> @test_8xdouble_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2) {
   15269 ; GENERIC-LABEL: test_8xdouble_unpack_high_mask0:
   15270 ; GENERIC:       # %bb.0:
   15271 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15272 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15273 ;
   15274 ; SKX-LABEL: test_8xdouble_unpack_high_mask0:
   15275 ; SKX:       # %bb.0:
   15276 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15277 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Plain high unpack: interleave the odd-indexed doubles of %vec1 and %vec2.
   15278   %unpacked = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   15279   ret <8 x double> %unpacked
   15280 }
   15281 define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
   15282 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask0:
   15283 ; GENERIC:       # %bb.0:
   15284 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   15285 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15286 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
   15287 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15288 ;
   15289 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask0:
   15290 ; SKX:       # %bb.0:
   15291 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   15292 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15293 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
   15294 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise the %vec3 lane.
   15295   %unpacked = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   15296   %mask.is.zero = icmp eq <8 x i64> %mask, zeroinitializer
   15297   %merged = select <8 x i1> %mask.is.zero, <8 x double> %unpacked, <8 x double> %vec3
   15298   ret <8 x double> %merged
   15299 }
   15300 
   15301 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
   15302 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
   15303 ; GENERIC:       # %bb.0:
   15304 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   15305 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15306 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15307 ;
   15308 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
   15309 ; SKX:       # %bb.0:
   15310 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   15311 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15312 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise zero.
   15313   %unpacked = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   15314   %mask.is.zero = icmp eq <8 x i64> %mask, zeroinitializer
   15315   %zeroed = select <8 x i1> %mask.is.zero, <8 x double> %unpacked, <8 x double> zeroinitializer
   15316   ret <8 x double> %zeroed
   15317 }
   15318 define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
   15319 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask1:
   15320 ; GENERIC:       # %bb.0:
   15321 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   15322 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15323 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
   15324 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15325 ;
   15326 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask1:
   15327 ; SKX:       # %bb.0:
   15328 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   15329 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15330 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
   15331 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise the %vec3 lane.
   15332   %unpacked = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   15333   %mask.is.zero = icmp eq <8 x i64> %mask, zeroinitializer
   15334   %merged = select <8 x i1> %mask.is.zero, <8 x double> %unpacked, <8 x double> %vec3
   15335   ret <8 x double> %merged
   15336 }
   15337 
   15338 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
   15339 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
   15340 ; GENERIC:       # %bb.0:
   15341 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   15342 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15343 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15344 ;
   15345 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
   15346 ; SKX:       # %bb.0:
   15347 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   15348 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15349 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise zero.
   15350   %unpacked = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   15351   %mask.is.zero = icmp eq <8 x i64> %mask, zeroinitializer
   15352   %zeroed = select <8 x i1> %mask.is.zero, <8 x double> %unpacked, <8 x double> zeroinitializer
   15353   ret <8 x double> %zeroed
   15354 }
   15355 define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
   15356 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask2:
   15357 ; GENERIC:       # %bb.0:
   15358 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   15359 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15360 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
   15361 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15362 ;
   15363 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask2:
   15364 ; SKX:       # %bb.0:
   15365 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   15366 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15367 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
   15368 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Merge-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise the %vec3 lane.
   15369   %unpacked = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   15370   %mask.is.zero = icmp eq <8 x i64> %mask, zeroinitializer
   15371   %merged = select <8 x i1> %mask.is.zero, <8 x double> %unpacked, <8 x double> %vec3
   15372   ret <8 x double> %merged
   15373 }
   15374 
   15375 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
   15376 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
   15377 ; GENERIC:       # %bb.0:
   15378 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   15379 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15380 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15381 ;
   15382 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
   15383 ; SKX:       # %bb.0:
   15384 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   15385 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15386 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Zero-masked high unpack: where a %mask lane is zero the result takes the
         ; interleaved odd-indexed doubles of %vec1/%vec2, otherwise zero.
   15387   %unpacked = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   15388   %mask.is.zero = icmp eq <8 x i64> %mask, zeroinitializer
   15389   %zeroed = select <8 x i1> %mask.is.zero, <8 x double> %unpacked, <8 x double> zeroinitializer
   15390   ret <8 x double> %zeroed
   15391 }
   15392 define <8 x double> @test_8xdouble_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2) {
   15393 ; GENERIC-LABEL: test_8xdouble_unpack_high_mask3:
   15394 ; GENERIC:       # %bb.0:
   15395 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15396 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15397 ;
   15398 ; SKX-LABEL: test_8xdouble_unpack_high_mask3:
   15399 ; SKX:       # %bb.0:
   15400 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15401 ; SKX-NEXT:    retq # sched: [7:1.00]
         ; Plain high unpack: interleave the odd-indexed doubles of %vec1 and %vec2.
   15402   %unpacked = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   15403   ret <8 x double> %unpacked
   15404 }
   15405 define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
         ; Merge-masking, register-operand form: interleave the odd-indexed elements
         ; of %vec1 and %vec2, keep a lane where the matching %mask element is zero,
         ; and take the lane from %vec3 otherwise. The assertions below expect
         ; vptestnmq to build %k1, a vunpckhpd {%k1} that writes zmm2, and a vmovapd
         ; that copies zmm2 into zmm0 before returning.
   15406 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask3:
   15407 ; GENERIC:       # %bb.0:
   15408 ; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
   15409 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15410 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
   15411 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15412 ;
   15413 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask3:
   15414 ; SKX:       # %bb.0:
   15415 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
   15416 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15417 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
   15418 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Shuffle indices 8-15 address %vec2, so the result is
           ; vec1[1],vec2[1],vec1[3],vec2[3],vec1[5],vec2[5],vec1[7],vec2[7].
   15419   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
           ; A lane is enabled when its %mask element equals zero.
   15420   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
           ; Disabled lanes pass through the corresponding element of %vec3.
   15421   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   15422   ret <8 x double> %res
   15423 }
   15424 
   15425 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
         ; Zero-masking, register-operand form: interleave the odd-indexed elements
         ; of %vec1 and %vec2, keep a lane only where the matching %mask element is
         ; zero, and force every other lane to 0.0. The assertions below pin the
         ; expected vptestnmq + vunpckhpd {%k1} {z} sequence together with the
         ; scheduling annotation printed for each RUN configuration.
   15426 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
   15427 ; GENERIC:       # %bb.0:
   15428 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   15429 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15430 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15431 ;
   15432 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
   15433 ; SKX:       # %bb.0:
   15434 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   15435 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
   15436 ; SKX-NEXT:    retq # sched: [7:1.00]
           ; Shuffle indices 8-15 address %vec2, so the result is
           ; vec1[1],vec2[1],vec1[3],vec2[3],vec1[5],vec2[5],vec1[7],vec2[7].
   15437   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
           ; A lane is enabled when its %mask element equals zero.
   15438   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
           ; Disabled lanes are zeroed rather than merged with another vector.
   15439   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   15440   ret <8 x double> %res
   15441 }
   15442 define <8 x double> @test_8xdouble_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
         ; Unmasked, memory-operand form: the second shuffle input is loaded from
         ; %vec2p. The assertions below expect no separate load instruction; the
         ; load appears as the mem[...] operand of a single vunpckhpd.
   15443 ; GENERIC-LABEL: test_8xdouble_unpack_high_mem_mask0:
   15444 ; GENERIC:       # %bb.0:
   15445 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15446 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15447 ;
   15448 ; SKX-LABEL: test_8xdouble_unpack_high_mem_mask0:
   15449 ; SKX:       # %bb.0:
   15450 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15451 ; SKX-NEXT:    retq # sched: [7:1.00]
   15452   %vec2 = load <8 x double>, <8 x double>* %vec2p
           ; Shuffle indices 8-15 address the loaded %vec2, so the result is
           ; vec1[1],vec2[1],vec1[3],vec2[3],vec1[5],vec2[5],vec1[7],vec2[7].
   15453   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   15454   ret <8 x double> %res
   15455 }
   15456 define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
         ; Merge-masking, memory-operand form: the second shuffle input is loaded
         ; from %vec2p; a lane is kept where the matching %mask element is zero and
         ; taken from %vec3 otherwise. The assertions below expect vptestnmq to
         ; build %k1, a vunpckhpd {%k1} with a mem[...] operand that writes zmm1,
         ; and a vmovapd that copies zmm1 into zmm0 before returning.
   15457 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
   15458 ; GENERIC:       # %bb.0:
   15459 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   15460 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15461 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   15462 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15463 ;
   15464 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
   15465 ; SKX:       # %bb.0:
   15466 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   15467 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15468 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   15469 ; SKX-NEXT:    retq # sched: [7:1.00]
   15470   %vec2 = load <8 x double>, <8 x double>* %vec2p
           ; Shuffle indices 8-15 address the loaded %vec2, so the result is
           ; vec1[1],vec2[1],vec1[3],vec2[3],vec1[5],vec2[5],vec1[7],vec2[7].
   15471   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
           ; A lane is enabled when its %mask element equals zero.
   15472   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
           ; Disabled lanes pass through the corresponding element of %vec3.
   15473   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   15474   ret <8 x double> %res
   15475 }
   15476 
   15477 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
         ; Zero-masking, memory-operand form: the second shuffle input is loaded
         ; from %vec2p; a lane is kept only where the matching %mask element is
         ; zero and every other lane is forced to 0.0. The assertions below expect
         ; vptestnmq to build %k1 followed by a vunpckhpd {%k1} {z} whose second
         ; source is the mem[...] operand.
   15478 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
   15479 ; GENERIC:       # %bb.0:
   15480 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   15481 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15482 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15483 ;
   15484 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
   15485 ; SKX:       # %bb.0:
   15486 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   15487 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15488 ; SKX-NEXT:    retq # sched: [7:1.00]
   15489   %vec2 = load <8 x double>, <8 x double>* %vec2p
           ; Shuffle indices 8-15 address the loaded %vec2, so the result is
           ; vec1[1],vec2[1],vec1[3],vec2[3],vec1[5],vec2[5],vec1[7],vec2[7].
   15490   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
           ; A lane is enabled when its %mask element equals zero.
   15491   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
           ; Disabled lanes are zeroed rather than merged with another vector.
   15492   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   15493   ret <8 x double> %res
   15494 }
   15495 
   15496 define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
         ; Merge-masking, memory-operand form: the second shuffle input is loaded
         ; from %vec2p; a lane is kept where the matching %mask element is zero and
         ; taken from %vec3 otherwise. The assertions below expect vptestnmq to
         ; build %k1, a vunpckhpd {%k1} with a mem[...] operand that writes zmm1,
         ; and a vmovapd that copies zmm1 into zmm0 before returning.
         ; NOTE(review): the IR body is the same as in the _mem_mask0 masked
         ; variant; only the function name differs.
   15497 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
   15498 ; GENERIC:       # %bb.0:
   15499 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   15500 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15501 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   15502 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15503 ;
   15504 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
   15505 ; SKX:       # %bb.0:
   15506 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   15507 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15508 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   15509 ; SKX-NEXT:    retq # sched: [7:1.00]
   15510   %vec2 = load <8 x double>, <8 x double>* %vec2p
           ; Shuffle indices 8-15 address the loaded %vec2, so the result is
           ; vec1[1],vec2[1],vec1[3],vec2[3],vec1[5],vec2[5],vec1[7],vec2[7].
   15511   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
           ; A lane is enabled when its %mask element equals zero.
   15512   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
           ; Disabled lanes pass through the corresponding element of %vec3.
   15513   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   15514   ret <8 x double> %res
   15515 }
   15516 
   15517 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
         ; Zero-masking, memory-operand form: the second shuffle input is loaded
         ; from %vec2p; a lane is kept only where the matching %mask element is
         ; zero and every other lane is forced to 0.0. The assertions below expect
         ; vptestnmq to build %k1 followed by a vunpckhpd {%k1} {z} whose second
         ; source is the mem[...] operand.
         ; NOTE(review): the IR body is the same as in the _mem_mask0 zero-masked
         ; variant; only the function name differs.
   15518 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
   15519 ; GENERIC:       # %bb.0:
   15520 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   15521 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15522 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15523 ;
   15524 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
   15525 ; SKX:       # %bb.0:
   15526 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   15527 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15528 ; SKX-NEXT:    retq # sched: [7:1.00]
   15529   %vec2 = load <8 x double>, <8 x double>* %vec2p
           ; Shuffle indices 8-15 address the loaded %vec2, so the result is
           ; vec1[1],vec2[1],vec1[3],vec2[3],vec1[5],vec2[5],vec1[7],vec2[7].
   15530   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
           ; A lane is enabled when its %mask element equals zero.
   15531   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
           ; Disabled lanes are zeroed rather than merged with another vector.
   15532   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   15533   ret <8 x double> %res
   15534 }
   15535 
   15536 define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
         ; Merge-masking, memory-operand form: the second shuffle input is loaded
         ; from %vec2p; a lane is kept where the matching %mask element is zero and
         ; taken from %vec3 otherwise. The assertions below expect vptestnmq to
         ; build %k1, a vunpckhpd {%k1} with a mem[...] operand that writes zmm1,
         ; and a vmovapd that copies zmm1 into zmm0 before returning.
         ; NOTE(review): the IR body is the same as in the _mem_mask0 masked
         ; variant; only the function name differs.
   15537 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
   15538 ; GENERIC:       # %bb.0:
   15539 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   15540 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15541 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   15542 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15543 ;
   15544 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
   15545 ; SKX:       # %bb.0:
   15546 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   15547 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15548 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   15549 ; SKX-NEXT:    retq # sched: [7:1.00]
   15550   %vec2 = load <8 x double>, <8 x double>* %vec2p
           ; Shuffle indices 8-15 address the loaded %vec2, so the result is
           ; vec1[1],vec2[1],vec1[3],vec2[3],vec1[5],vec2[5],vec1[7],vec2[7].
   15551   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
           ; A lane is enabled when its %mask element equals zero.
   15552   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
           ; Disabled lanes pass through the corresponding element of %vec3.
   15553   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   15554   ret <8 x double> %res
   15555 }
   15556 
   15557 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
         ; Zero-masking, memory-operand form: the second shuffle input is loaded
         ; from %vec2p; a lane is kept only where the matching %mask element is
         ; zero and every other lane is forced to 0.0. The assertions below expect
         ; vptestnmq to build %k1 followed by a vunpckhpd {%k1} {z} whose second
         ; source is the mem[...] operand.
         ; NOTE(review): the IR body is the same as in the _mem_mask0 zero-masked
         ; variant; only the function name differs.
   15558 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
   15559 ; GENERIC:       # %bb.0:
   15560 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   15561 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15562 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15563 ;
   15564 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
   15565 ; SKX:       # %bb.0:
   15566 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   15567 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15568 ; SKX-NEXT:    retq # sched: [7:1.00]
   15569   %vec2 = load <8 x double>, <8 x double>* %vec2p
           ; Shuffle indices 8-15 address the loaded %vec2, so the result is
           ; vec1[1],vec2[1],vec1[3],vec2[3],vec1[5],vec2[5],vec1[7],vec2[7].
   15570   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
           ; A lane is enabled when its %mask element equals zero.
   15571   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
           ; Disabled lanes are zeroed rather than merged with another vector.
   15572   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   15573   ret <8 x double> %res
   15574 }
   15575 
   15576 define <8 x double> @test_8xdouble_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
         ; Unmasked, memory-operand form: the second shuffle input is loaded from
         ; %vec2p. The assertions below expect no separate load instruction; the
         ; load appears as the mem[...] operand of a single vunpckhpd.
         ; NOTE(review): the IR body is the same as in the _mem_mask0 unmasked
         ; variant; only the function name differs.
   15577 ; GENERIC-LABEL: test_8xdouble_unpack_high_mem_mask3:
   15578 ; GENERIC:       # %bb.0:
   15579 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15580 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15581 ;
   15582 ; SKX-LABEL: test_8xdouble_unpack_high_mem_mask3:
   15583 ; SKX:       # %bb.0:
   15584 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15585 ; SKX-NEXT:    retq # sched: [7:1.00]
   15586   %vec2 = load <8 x double>, <8 x double>* %vec2p
           ; Shuffle indices 8-15 address the loaded %vec2, so the result is
           ; vec1[1],vec2[1],vec1[3],vec2[3],vec1[5],vec2[5],vec1[7],vec2[7].
   15587   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   15588   ret <8 x double> %res
   15589 }
   15590 define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
         ; Merge-masking, memory-operand form: the second shuffle input is loaded
         ; from %vec2p; a lane is kept where the matching %mask element is zero and
         ; taken from %vec3 otherwise. The assertions below expect vptestnmq to
         ; build %k1, a vunpckhpd {%k1} with a mem[...] operand that writes zmm1,
         ; and a vmovapd that copies zmm1 into zmm0 before returning.
         ; NOTE(review): the IR body is the same as in the _mem_mask0 masked
         ; variant; only the function name differs.
   15591 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
   15592 ; GENERIC:       # %bb.0:
   15593 ; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
   15594 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15595 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
   15596 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15597 ;
   15598 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
   15599 ; SKX:       # %bb.0:
   15600 ; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
   15601 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15602 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
   15603 ; SKX-NEXT:    retq # sched: [7:1.00]
   15604   %vec2 = load <8 x double>, <8 x double>* %vec2p
           ; Shuffle indices 8-15 address the loaded %vec2, so the result is
           ; vec1[1],vec2[1],vec1[3],vec2[3],vec1[5],vec2[5],vec1[7],vec2[7].
   15605   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
           ; A lane is enabled when its %mask element equals zero.
   15606   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
           ; Disabled lanes pass through the corresponding element of %vec3.
   15607   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   15608   ret <8 x double> %res
   15609 }
   15610 
   15611 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
         ; Zero-masking, memory-operand form: the second shuffle input is loaded
         ; from %vec2p; a lane is kept only where the matching %mask element is
         ; zero and every other lane is forced to 0.0. The assertions below expect
         ; vptestnmq to build %k1 followed by a vunpckhpd {%k1} {z} whose second
         ; source is the mem[...] operand.
         ; NOTE(review): the IR body is the same as in the _mem_mask0 zero-masked
         ; variant; only the function name differs.
   15612 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
   15613 ; GENERIC:       # %bb.0:
   15614 ; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
   15615 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15616 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   15617 ;
   15618 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
   15619 ; SKX:       # %bb.0:
   15620 ; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
   15621 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
   15622 ; SKX-NEXT:    retq # sched: [7:1.00]
   15623   %vec2 = load <8 x double>, <8 x double>* %vec2p
           ; Shuffle indices 8-15 address the loaded %vec2, so the result is
           ; vec1[1],vec2[1],vec1[3],vec2[3],vec1[5],vec2[5],vec1[7],vec2[7].
   15624   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
           ; A lane is enabled when its %mask element equals zero.
   15625   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
           ; Disabled lanes are zeroed rather than merged with another vector.
   15626   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   15627   ret <8 x double> %res
   15628 }
   15629 
   15630