; NOTE(review): removed code-viewer navigation residue ("Home | History | Annotate | Download | only in X86") -- it is not part of the test file and is not valid LLVM IR.
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
      3 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
      4 
      5 target triple = "x86_64-unknown-unknown"
      6 
; Splat of f32 lane 0 across all 16 lanes: expected to lower to a single
; zmm vbroadcastss from xmm0 (same codegen for both RUN configurations).
      7 define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x float> %a, <16 x float> %b) {
      8 ; ALL-LABEL: shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
      9 ; ALL:       # %bb.0:
     10 ; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
     11 ; ALL-NEXT:    retq
     12   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
     13   ret <16 x float> %shuffle
     14 }
     15 
; Splat of lane 8: the source element lives in 128-bit lane 2, so codegen
; first extracts that lane (vextractf32x4 $2) and then broadcasts it.
     16 define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) {
     17 ; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
     18 ; ALL:       # %bb.0:
     19 ; ALL-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
     20 ; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
     21 ; ALL-NEXT:    retq
     22   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
     23   ret <16 x float> %shuffle
     24 }
     25 
; Same splat-of-lane-8 test, but the inputs arrive as <16 x i32> and are
; bitcast to float first; expected codegen is identical to the test above,
; i.e. the bitcasts must not block the extract+broadcast lowering.
     26 define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc(<16 x i32> %a, <16 x i32> %b) {
     27 ; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
     28 ; ALL:       # %bb.0:
     29 ; ALL-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
     30 ; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
     31 ; ALL-NEXT:    retq
     32   %tmp0 = bitcast <16 x i32> %a to <16 x float>
     33   %tmp1 = bitcast <16 x i32> %b to <16 x float>
     34   %shuffle = shufflevector <16 x float> %tmp0, <16 x float> %tmp1, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
     35   ret <16 x float> %shuffle
     36 }
     37 
     37 
; Per-128-bit-lane interleave of the low elements of %a and %b: matches the
; vunpcklps zmm pattern directly (one instruction).
     38 define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x float> %a, <16 x float> %b) {
     39 ; ALL-LABEL: shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d:
     40 ; ALL:       # %bb.0:
     41 ; ALL-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
     42 ; ALL-NEXT:    retq
     43   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
     44   ret <16 x float> %shuffle
     45 }
     46 
; Interleave with a zero second operand: every odd mask element indexes the
; zeroinitializer vector, so codegen materializes a zero register (vxorps)
; and reuses the same vunpcklps pattern.
     47 define <16 x float> @shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz(<16 x float> %a, <16 x float> %b) {
     48 ; ALL-LABEL: shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz:
     49 ; ALL:       # %bb.0:
     50 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
     51 ; ALL-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
     52 ; ALL-NEXT:    retq
     53   %shuffle = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <16 x i32><i32 0, i32 16, i32 1, i32 16, i32 4, i32 16, i32 5, i32 16, i32 8, i32 16, i32 9, i32 16, i32 12, i32 16, i32 13, i32 16>
     54   ret <16 x float> %shuffle
     55 }
     56 
; Interleave with the operands swapped in the mask (b first, a second):
; still a single vunpcklps, with zmm1 as the first source in the pattern.
     57 define <16 x float> @shuffle_v16f32_vunpcklps_swap(<16 x float> %a, <16 x float> %b) {
     58 ; ALL-LABEL: shuffle_v16f32_vunpcklps_swap:
     59 ; ALL:       # %bb.0:
     60 ; ALL-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13]
     61 ; ALL-NEXT:    retq
     62   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 16, i32 0, i32 17, i32 1, i32 20, i32 4, i32 21, i32 5, i32 24, i32 8, i32 25, i32 9, i32 28, i32 12, i32 29, i32 13>
     63   ret <16 x float> %shuffle
     64 }
     65 
     65 
     66 ; PR34382
; Regression test: a single-source shuffle whose indices stay within their
; own 128-bit lane should lower to one in-lane vpermilps, not a cross-lane
; permute.
     67 define <16 x float> @shuffle_v16f32_01_01_03_00_06_04_05_07_08_08_09_09_15_14_14_12(<16 x float> %a0) {
     68 ; ALL-LABEL: shuffle_v16f32_01_01_03_00_06_04_05_07_08_08_09_09_15_14_14_12:
     69 ; ALL:       # %bb.0:
     70 ; ALL-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[1,1,3,0,6,4,5,7,8,8,9,9,15,14,14,12]
     71 ; ALL-NEXT:    retq
     72   %shuffle = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 0, i32 6, i32 4, i32 5, i32 7, i32 8, i32 8, i32 9, i32 9, i32 15, i32 14, i32 14, i32 12>
     73   ret <16 x float> %shuffle
     74 }
     75 
     75 
; Integer version of the low interleave: expected to use the FP-domain
; vunpcklps instruction (per the autogenerated checks) rather than vpunpckldq.
     76 define <16 x i32> @shuffle_v16i32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x i32> %a, <16 x i32> %b) {
     77 ; ALL-LABEL: shuffle_v16i32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d:
     78 ; ALL:       # %bb.0:
     79 ; ALL-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
     80 ; ALL-NEXT:    retq
     81   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
     82   ret <16 x i32> %shuffle
     83 }
     84 
; Here operand 0 is zeroinitializer, so the scrambled even indices (15,13,...)
; all read zero anyway; codegen canonicalizes to xor-zero + vunpcklps.
     85 define <16 x i32> @shuffle_v16i32_zz_10_zz_11_zz_14_zz_15_zz_18_zz_19_zz_1c_zz_1d(<16 x i32> %a, <16 x i32> %b) {
     86 ; ALL-LABEL: shuffle_v16i32_zz_10_zz_11_zz_14_zz_15_zz_18_zz_19_zz_1c_zz_1d:
     87 ; ALL:       # %bb.0:
     88 ; ALL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
     89 ; ALL-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
     90 ; ALL-NEXT:    retq
     91   %shuffle = shufflevector <16 x i32> zeroinitializer, <16 x i32> %b, <16 x i32><i32 15, i32 16, i32 13, i32 17, i32 11, i32 20, i32 9, i32 21, i32 7, i32 24, i32 5, i32 25, i32 3, i32 28, i32 1, i32 29>
     92   ret <16 x i32> %shuffle
     93 }
     94 
     94 
; Per-128-bit-lane interleave of the high elements: single vunpckhps.
     95 define <16 x float> @shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x float> %a, <16 x float> %b) {
     96 ; ALL-LABEL: shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f:
     97 ; ALL:       # %bb.0:
     98 ; ALL-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
     99 ; ALL-NEXT:    retq
    100   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
    101   ret <16 x float> %shuffle
    102 }
    103 
; High interleave where operand 0 is zeroinitializer: the even indices into
; the zero vector are all-zero regardless of value, so xor-zero + vunpckhps.
    104 define <16 x float> @shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f(<16 x float> %a, <16 x float> %b) {
    105 ; ALL-LABEL: shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f:
    106 ; ALL:       # %bb.0:
    107 ; ALL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
    108 ; ALL-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
    109 ; ALL-NEXT:    retq
    110   %shuffle = shufflevector <16 x float> zeroinitializer, <16 x float> %b, <16 x i32><i32 0, i32 18, i32 0, i32 19, i32 4, i32 22, i32 4, i32 23, i32 6, i32 26, i32 6, i32 27, i32 8, i32 30, i32 8, i32 31>
    111   ret <16 x float> %shuffle
    112 }
    113 
    113 
; Duplicate each even element: matches the vmovsldup zmm pattern.
    114 define <16 x float> @shuffle_v16f32_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x float> %a, <16 x float> %b) {
    115 ; ALL-LABEL: shuffle_v16f32_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
    116 ; ALL:       # %bb.0:
    117 ; ALL-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
    118 ; ALL-NEXT:    retq
    119   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
    120   ret <16 x float> %shuffle
    121 }
    122 
; Duplicate each odd element: matches the vmovshdup zmm pattern.
    123 define <16 x float> @shuffle_v16f32_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x float> %a, <16 x float> %b) {
    124 ; ALL-LABEL: shuffle_v16f32_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
    125 ; ALL:       # %bb.0:
    126 ; ALL-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
    127 ; ALL-NEXT:    retq
    128   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
    129   ret <16 x float> %shuffle
    130 }
    131 
    131 
; The f32 pairs move as whole 64-bit elements, so this lowers to a single
; 8-element vpermilpd (indices shown in f64-element units).
    132 define <16 x float> @shuffle_v16f32_00_01_00_01_06_07_06_07_08_09_10_11_12_13_12_13(<16 x float> %a, <16 x float> %b) {
    133 ; ALL-LABEL: shuffle_v16f32_00_01_00_01_06_07_06_07_08_09_10_11_12_13_12_13:
    134 ; ALL:       # %bb.0:
    135 ; ALL-NEXT:    vpermilpd {{.*#+}} zmm0 = zmm0[0,0,3,3,4,5,6,6]
    136 ; ALL-NEXT:    retq
    137   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 12, i32 13>
    138   ret <16 x float> %shuffle
    139 }
    140 
; Same 4-element pattern repeated in every 128-bit lane: single immediate
; vpermilps.
    141 define <16 x float> @shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12(<16 x float> %a, <16 x float> %b) {
    142 ; ALL-LABEL: shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12:
    143 ; ALL:       # %bb.0:
    144 ; ALL-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4,8,8,10,8,12,12,14,12]
    145 ; ALL-NEXT:    retq
    146   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4, i32 8, i32 8, i32 10, i32 8, i32 12, i32 12, i32 14, i32 12>
    147   ret <16 x float> %shuffle
    148 }
    149 
; Mostly-undef mask with one defined element per lane: the undef lanes are
; free to take whatever the chosen vpermilps immediate produces.
    150 define <16 x float> @shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12(<16 x float> %a, <16 x float> %b) {
    151 ; ALL-LABEL: shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12:
    152 ; ALL:       # %bb.0:
    153 ; ALL-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,0,3,0,7,4,7,4,11,8,11,8,15,12,15,12]
    154 ; ALL-NEXT:    retq
    155   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12>
    156   ret <16 x float> %shuffle
    157 }
    158 
    158 
; Integer splat of element 0: still a single vbroadcastss (FP-domain
; broadcast chosen even for the i32 vector, per the autogenerated checks).
    159 define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i32> %a, <16 x i32> %b) {
    160 ; ALL-LABEL: shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
    161 ; ALL:       # %bb.0:
    162 ; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
    163 ; ALL-NEXT:    retq
    164   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
    165   ret <16 x i32> %shuffle
    166 }
    167 
; Splat of element 4: extract the second 128-bit lane (a 256-bit
; vextractf128 suffices) and broadcast it.
    168 define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) {
    169 ; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
    170 ; ALL:       # %bb.0:
    171 ; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
    172 ; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
    173 ; ALL-NEXT:    retq
    174   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
    175   ret <16 x i32> %shuffle
    176 }
    177 
    177 
; Integer high interleave: lowered with the FP-domain vunpckhps.
    178 define <16 x i32> @shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x i32> %a, <16 x i32> %b) {
    179 ; ALL-LABEL: shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f:
    180 ; ALL:       # %bb.0:
    181 ; ALL-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
    182 ; ALL-NEXT:    retq
    183   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
    184   ret <16 x i32> %shuffle
    185 }
    186 
; High interleave with a zero second operand: the scrambled odd indices all
; point into zeroinitializer, so they are interchangeable and the pattern
; canonicalizes to xor-zero + vunpckhps.
    187 define <16 x i32> @shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_zz(<16 x i32> %a, <16 x i32> %b) {
    188 ; ALL-LABEL: shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_zz:
    189 ; ALL:       # %bb.0:
    190 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    191 ; ALL-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
    192 ; ALL-NEXT:    retq
    193   %shuffle = shufflevector <16 x i32> %a, <16 x i32> zeroinitializer, <16 x i32><i32 2, i32 30, i32 3, i32 28, i32 6, i32 26, i32 7, i32 24, i32 10, i32 22, i32 11, i32 20, i32 14, i32 18, i32 15, i32 16>
    194   ret <16 x i32> %shuffle
    195 }
    196 
    196 
; Per-lane rotate pulling one element of %b after every three of %a.
; The two RUN configurations diverge: plain AVX512F needs a vpermt2d with a
; constant-pool index vector, while AVX512BW can use a single byte-granular
; vpalignr.
    197 define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28(<16 x i32> %a, <16 x i32> %b) {
    198 ; AVX512F-LABEL: shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28:
    199 ; AVX512F:       # %bb.0:
    200 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,2,3,16,5,6,7,20,9,10,11,24,13,14,15,28]
    201 ; AVX512F-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
    202 ; AVX512F-NEXT:    retq
    203 ;
    204 ; AVX512BW-LABEL: shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28:
    205 ; AVX512BW:       # %bb.0:
    206 ; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3],zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19],zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35],zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51]
    207 ; AVX512BW-NEXT:    retq
    208   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 16, i32 5, i32 6, i32 7, i32 20, i32 9, i32 10, i32 11, i32 24, i32 13, i32 14, i32 15, i32 28>
    209   ret <16 x i32> %shuffle
    210 }
    211 
    211 
; Single-source cross-lane permute: lowered to vpermps with a constant-pool
; index vector; undef mask lanes print as 'u' in the constant.
    212 define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x float> %a)  {
    213 ; ALL-LABEL: shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01:
    214 ; ALL:       # %bb.0:
    215 ; ALL-NEXT:    vmovaps {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1>
    216 ; ALL-NEXT:    vpermps %zmm0, %zmm1, %zmm0
    217 ; ALL-NEXT:    retq
    218   %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1,  i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
    219   ret <16 x float> %c
    220 }
    221 
; Same permute on <16 x i32>: identical codegen (vpermps) as the float test.
    222 define <16 x i32> @shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x i32> %a)  {
    223 ; ALL-LABEL: shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01:
    224 ; ALL:       # %bb.0:
    225 ; ALL-NEXT:    vmovaps {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1>
    226 ; ALL-NEXT:    vpermps %zmm0, %zmm1, %zmm0
    227 ; ALL-NEXT:    retq
    228   %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1,  i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
    229   ret <16 x i32> %c
    230 }
    231 
    231 
; Two-source cross-lane permute on i32: vpermt2d with a constant index
; vector (indices >= 16 select from the second source).
    232 define <16 x i32> @shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x i32> %a, <16 x i32> %b)  {
    233 ; ALL-LABEL: shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
    234 ; ALL:       # %bb.0:
    235 ; ALL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
    236 ; ALL-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
    237 ; ALL-NEXT:    retq
    238   %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
    239   ret <16 x i32> %c
    240 }
    241 
; Same permute on f32: uses the FP form vpermt2ps with vmovaps for the
; index constant.
    242 define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, <16 x float> %b)  {
    243 ; ALL-LABEL: shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
    244 ; ALL:       # %bb.0:
    245 ; ALL-NEXT:    vmovaps {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
    246 ; ALL-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
    247 ; ALL-NEXT:    retq
    248   %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
    249   ret <16 x float> %c
    250 }
    251 
    251 
; Same vpermt2ps permute, but the second source comes from memory: the load
; must fold into the instruction as the (%rdi) operand.
    252 define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, <16 x float>* %b)  {
    253 ; ALL-LABEL: shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
    254 ; ALL:       # %bb.0:
    255 ; ALL-NEXT:    vmovaps {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
    256 ; ALL-NEXT:    vpermt2ps (%rdi), %zmm1, %zmm0
    257 ; ALL-NEXT:    retq
    258   %c = load <16 x float>, <16 x float>* %b
    259   %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
    260   ret <16 x float> %d
    261 }
    262 
; Integer version of the load-folding test: vpermt2d with a memory operand.
    263 define <16 x i32> @shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x i32> %a, <16 x i32>* %b)  {
    264 ; ALL-LABEL: shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
    265 ; ALL:       # %bb.0:
    266 ; ALL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
    267 ; ALL-NEXT:    vpermt2d (%rdi), %zmm1, %zmm0
    268 ; ALL-NEXT:    retq
    269   %c = load <16 x i32>, <16 x i32>* %b
    270   %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
    271   ret <16 x i32> %d
    272 }
    273 
    273 
; Only the first four result lanes are defined, so the whole shuffle shrinks
; to a 128-bit vblendps taking element 3 from xmm1.
    274 define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a, <16 x i32> %b)  {
    275 ; ALL-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:
    276 ; ALL:       # %bb.0:
    277 ; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
    278 ; ALL-NEXT:    retq
    279   %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    280   ret <16 x i32> %c
    281 }
    282 
    282 
    283 ;FIXME: can do better with vpcompress
; Extract the odd-indexed elements into a <8 x i32>: currently an extract of
; the upper 256 bits, a vshufps interleave, and a vpermpd lane fixup.
    284 define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
    285 ; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
    286 ; ALL:       # %bb.0:
    287 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
    288 ; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
    289 ; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
    290 ; ALL-NEXT:    retq
    291   %res = shufflevector <16 x i32> %v, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
    292   ret <8 x i32> %res
    293 }
    294 
    294 
    295 ;FIXME: can do better with vpcompress
; Narrow to <4 x i32> {0,1,2,12}: element 12 is extracted from the top
; 128-bit lane, broadcast, and blended in; vzeroupper before returning an
; xmm value.
    296 define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
    297 ; ALL-LABEL: test_v16i32_0_1_2_12:
    298 ; ALL:       # %bb.0:
    299 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
    300 ; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
    301 ; ALL-NEXT:    vbroadcastss %xmm1, %xmm1
    302 ; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
    303 ; ALL-NEXT:    vzeroupper
    304 ; ALL-NEXT:    retq
    305   %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
    306   ret <4 x i32> %res
    307 }
    308 
    308 
; Extracting the upper half of a loaded <16 x float> should narrow the load
; itself: a single 256-bit vmovups at offset 32, no 512-bit load.
    309 define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
    310 ; ALL-LABEL: shuffle_v16f32_extract_256:
    311 ; ALL:       # %bb.0:
    312 ; ALL-NEXT:    vmovups 32(%rsi), %ymm0
    313 ; ALL-NEXT:    retq
    314   %ptr_a = bitcast float* %a to <16 x float>*
    315   %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4
    316   %v2 = shufflevector <16 x float> %v_a, <16 x float> undef, <8 x i32>  <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    317   ret <8 x float> %v2
    318 }
    319 
    319 
    320 ;FIXME: can do better with vcompressp
; Narrow to <8 x float> with one cross-half element (10): built via an
; extract of the top half, a dup of the needed element, an insert, an
; in-lane permute, and a final blend.
    321 define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
    322 ; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10:
    323 ; ALL:       # %bb.0:
    324 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
    325 ; ALL-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2]
    326 ; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
    327 ; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,u]
    328 ; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
    329 ; ALL-NEXT:    retq
    330   %res = shufflevector <16 x float> %v, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 10>
    331   ret <8 x float> %res
    332 }
    333 
    333 
    334 ;FIXME: can do better with vcompressp
; Narrow to <4 x float> {0,1,3,6}: element 6 comes from the second 128-bit
; lane via vextractf128/vpermilpd, then vinsertps merges it into lane 3.
    335 define <4 x float> @test_v16f32_0_1_3_6 (<16 x float> %v) {
    336 ; ALL-LABEL: test_v16f32_0_1_3_6:
    337 ; ALL:       # %bb.0:
    338 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,3,3]
    339 ; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
    340 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    341 ; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    342 ; ALL-NEXT:    vzeroupper
    343 ; ALL-NEXT:    retq
    344   %res = shufflevector <16 x float> %v, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 6>
    345   ret <4 x float> %res
    346 }
    347 
    347 
; NOTE(review): despite the "v16i16" in the name, both tests below operate
; on <16 x i32>; the names appear to be copy-paste leftovers -- verify
; against the upstream test before renaming (CHECK-LABELs would need
; regeneration).
; In-lane {1,0,0,0} pattern: single immediate vpermilps.
    348 define <16 x i32> @shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12(<16 x i32> %a, <16 x i32> %b)  {
    349 ; ALL-LABEL: shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12:
    350 ; ALL:       # %bb.0:
    351 ; ALL-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
    352 ; ALL-NEXT:    retq
    353   %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
    354   ret <16 x i32> %c
    355 }
    356 
; NOTE(review): the name's index list (3_3_0_0...) does not match the
; actual mask, which is <2,3,0,1,...> -- the codegen check matches the
; mask, not the name.
    357 define <16 x i32> @shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12(<16 x i32> %a, <16 x i32> %b)  {
    358 ; ALL-LABEL: shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12:
    359 ; ALL:       # %bb.0:
    360 ; ALL-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
    361 ; ALL-NEXT:    retq
    362   %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
    363   ret <16 x i32> %c
    364 }
    365 
    365 
; Two elements of %a followed by a repeated element of %b in every 128-bit
; lane: matches the vshufps zmm pattern.
    366 define <16 x float> @shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c_1c(<16 x float> %a, <16 x float> %b) {
    367 ; ALL-LABEL: shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c_1c:
    368 ; ALL:       # %bb.0:
    369 ; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,1],zmm1[0,0],zmm0[4,5],zmm1[4,4],zmm0[8,9],zmm1[8,8],zmm0[12,13],zmm1[12,12]
    370 ; ALL-NEXT:    retq
    371   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
    372   ret <16 x float> %shuffle
    373 }
    374 
    374 
; Load one i32 into lane 0 with all other lanes zero: a single scalar
; vmovss (which zeroes the remaining lanes) covers the whole pattern.
    375 define <16 x i32> @insert_mem_and_zero_v16i32(i32* %ptr) {
    376 ; ALL-LABEL: insert_mem_and_zero_v16i32:
    377 ; ALL:       # %bb.0:
    378 ; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
    379 ; ALL-NEXT:    retq
    380   %a = load i32, i32* %ptr
    381   %v = insertelement <16 x i32> undef, i32 %a, i32 0
    382   %shuffle = shufflevector <16 x i32> %v, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    383   ret <16 x i32> %shuffle
    384 }
    385 
    385 
    386 
; Keep lane 0, zero the rest: done as an xmm-width xor + blendps (writing
; xmm0 implicitly zeroes the upper zmm bits under VEX/EVEX encoding).
    387 define <16 x i32> @shuffle_v16i32_0zzzzzzzzzzzzzzz(<16 x i32> %a) {
    388 ; ALL-LABEL: shuffle_v16i32_0zzzzzzzzzzzzzzz:
    389 ; ALL:       # %bb.0:
    390 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    391 ; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
    392 ; ALL-NEXT:    retq
    393   %shuffle = shufflevector <16 x i32> %a, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    394   ret <16 x i32> %shuffle
    395 }
    396 
; Float version of the keep-lane-0 test: identical codegen.
    397 define <16 x float> @shuffle_v16f32_0zzzzzzzzzzzzzzz(<16 x float> %a) {
    398 ; ALL-LABEL: shuffle_v16f32_0zzzzzzzzzzzzzzz:
    399 ; ALL:       # %bb.0:
    400 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    401 ; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
    402 ; ALL-NEXT:    retq
    403   %shuffle = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    404   ret <16 x float> %shuffle
    405 }
    406 
    406 
; Interleaving %a's low 8 elements with zeros is a 32->64-bit zero-extend
; pattern: recognized and lowered to a single vpmovzxdq.
    407 define <16 x i32> @shuffle_v16i32_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_23_zz(<16 x i32> %a) {
    408 ; ALL-LABEL: shuffle_v16i32_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_23_zz:
    409 ; ALL:       # %bb.0:
    410 ; ALL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
    411 ; ALL-NEXT:    retq
    412   %shuffle = shufflevector <16 x i32> zeroinitializer, <16 x i32> %a, <16 x i32> <i32 16, i32 0, i32 17, i32 0, i32 18, i32 0, i32 19, i32 0, i32 20, i32 0, i32 21, i32 0, i32 22, i32 0, i32 23, i32 0>
    413   ret <16 x i32> %shuffle
    414 }
    415 
    415 
; Shift-by-one concatenation of two vectors: single two-source valignd.
    416 define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b) {
    417 ; ALL-LABEL: shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
    418 ; ALL:       # %bb.0:
    419 ; ALL-NEXT:    valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
    420 ; ALL-NEXT:    retq
    421   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
    422   ret <16 x i32> %shuffle
    423 }
    424 
; Rotate-by-one of a single vector (second operand undef): one-source
; valignd.
    425 define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i32> %a) {
    426 ; ALL-LABEL: shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
    427 ; ALL:       # %bb.0:
    428 ; ALL-NEXT:    valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
    429 ; ALL-NEXT:    retq
    430   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
    431   ret <16 x i32> %shuffle
    432 }
    433 
    433 
; Two-from-%a / two-from-%b per 128-bit lane: single vshufps.
    434 define <16 x i32> @shuffle_v16i32_00_03_16_19_04_07_20_23_08_11_24_27_12_15_28_31(<16 x i32> %a, <16 x i32> %b) {
    435 ; ALL-LABEL: shuffle_v16i32_00_03_16_19_04_07_20_23_08_11_24_27_12_15_28_31:
    436 ; ALL:       # %bb.0:
    437 ; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,3],zmm0[4,7],zmm1[4,7],zmm0[8,11],zmm1[8,11],zmm0[12,15],zmm1[12,15]
    438 ; ALL-NEXT:    retq
    439   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 3, i32 16, i32 19, i32 4, i32 7, i32 20, i32 23, i32 8, i32 11, i32 24, i32 27, i32 12, i32 15, i32 28, i32 31>
    440   ret <16 x i32> %shuffle
    441 }
    442 
; vshufps with %b as the first source; the trailing undef lanes take
; whatever the last lane of the pattern produces.
    443 define <16 x i32> @shuffle_v16i32_16_16_02_03_20_20_06_07_24_24_10_11_28_28_uu_uu(<16 x i32> %a, <16 x i32> %b) {
    444 ; ALL-LABEL: shuffle_v16i32_16_16_02_03_20_20_06_07_24_24_10_11_28_28_uu_uu:
    445 ; ALL:       # %bb.0:
    446 ; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm1[0,0],zmm0[2,3],zmm1[4,4],zmm0[6,7],zmm1[8,8],zmm0[10,11],zmm1[12,12],zmm0[14,15]
    447 ; ALL-NEXT:    retq
    448   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 16, i32 16, i32 02, i32 03, i32 20, i32 20, i32 06, i32 07, i32 24, i32 24, i32 10, i32 11, i32 28, i32 28, i32 undef, i32 undef>
    449   ret <16 x i32> %shuffle
    450 }
    451 
; NOTE(review): name says "v8i32" but the test is <16 x i32> -- likely a
; copy-paste leftover; verify against upstream before renaming.
; Reversed pairs from %b then %a in each lane: single vshufps.
    452 define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12(<16 x i32> %a, <16 x i32> %b) {
    453 ; ALL-LABEL: shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12:
    454 ; ALL:       # %bb.0:
    455 ; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm1[1,0],zmm0[1,0],zmm1[5,4],zmm0[5,4],zmm1[9,8],zmm0[9,8],zmm1[13,12],zmm0[13,12]
    456 ; ALL-NEXT:    retq
    457   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 17, i32 16, i32 01, i32 00, i32 21, i32 20, i32 05, i32 04, i32 25, i32 24, i32 09, i32 08, i32 29, i32 28, i32 13, i32 12>
    458   ret <16 x i32> %shuffle
    459 }
    460 
    460 
; Widening splat: lane 4 of a <8 x float> broadcast into all 16 zmm lanes
; via a 128-bit extract + vbroadcastss.
    461 define <16 x float> @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<8 x float> %a) {
    462 ; ALL-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
    463 ; ALL:       # %bb.0:
    464 ; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
    465 ; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
    466 ; ALL-NEXT:    retq
    467   %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
    468   ret <16 x float> %shuffle
    469 }
    470 
    470 
    ; Single-source element rotation (left by 2) of v16i32 combined with a
    ; select on a bitcast i16 mask.  The shuffle+select pair should fold into
    ; one merge-masked valignd writing into the %passthru register (zmm1),
    ; followed by a move back to the return register.  AVX512F and AVX512BW
    ; differ only in how the scalar mask reaches k1 (kmovw vs kmovd).
    471 define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, <16 x i32> %passthru, i16 %mask) {
    472 ; AVX512F-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
    473 ; AVX512F:       # %bb.0:
    474 ; AVX512F-NEXT:    kmovw %edi, %k1
    475 ; AVX512F-NEXT:    valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
    476 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
    477 ; AVX512F-NEXT:    retq
    478 ;
    479 ; AVX512BW-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
    480 ; AVX512BW:       # %bb.0:
    481 ; AVX512BW-NEXT:    kmovd %edi, %k1
    482 ; AVX512BW-NEXT:    valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
    483 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
    484 ; AVX512BW-NEXT:    retq
    485   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
    486   %mask.cast = bitcast i16 %mask to <16 x i1>
    487   %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
    488   ret <16 x i32> %res
    489 }
    490 
    ; Two-source variant of the rotation above: indices 16 and 17 pull the
    ; first two elements of %b, so the masked valignd concatenates across both
    ; zmm sources (zmm0[2..15],zmm1[0,1]) while merging into %passthru (zmm2).
    491 define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
    492 ; AVX512F-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
    493 ; AVX512F:       # %bb.0:
    494 ; AVX512F-NEXT:    kmovw %edi, %k1
    495 ; AVX512F-NEXT:    valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
    496 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
    497 ; AVX512F-NEXT:    retq
    498 ;
    499 ; AVX512BW-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
    500 ; AVX512BW:       # %bb.0:
    501 ; AVX512BW-NEXT:    kmovd %edi, %k1
    502 ; AVX512BW-NEXT:    valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
    503 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
    504 ; AVX512BW-NEXT:    retq
    505   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
    506   %mask.cast = bitcast i16 %mask to <16 x i1>
    507   %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
    508   ret <16 x i32> %res
    509 }
    510 
    ; Zero-masking variant of the single-source rotation: selecting against
    ; zeroinitializer should fold into valignd's {z} form.  With no passthru
    ; to preserve, the result stays in zmm0 and no extra register move is
    ; needed.
    511 define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, i16 %mask) {
    512 ; AVX512F-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
    513 ; AVX512F:       # %bb.0:
    514 ; AVX512F-NEXT:    kmovw %edi, %k1
    515 ; AVX512F-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
    516 ; AVX512F-NEXT:    retq
    517 ;
    518 ; AVX512BW-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
    519 ; AVX512BW:       # %bb.0:
    520 ; AVX512BW-NEXT:    kmovd %edi, %k1
    521 ; AVX512BW-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
    522 ; AVX512BW-NEXT:    retq
    523   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
    524   %mask.cast = bitcast i16 %mask to <16 x i1>
    525   %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> zeroinitializer
    526   ret <16 x i32> %res
    527 }
    528 
    ; Zero-masking, two-source variant: the cross-input concatenating valignd
    ; (zmm0[2..15],zmm1[0,1]) with {z}, writing directly into zmm0.
    529 define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
    530 ; AVX512F-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
    531 ; AVX512F:       # %bb.0:
    532 ; AVX512F-NEXT:    kmovw %edi, %k1
    533 ; AVX512F-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
    534 ; AVX512F-NEXT:    retq
    535 ;
    536 ; AVX512BW-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
    537 ; AVX512BW:       # %bb.0:
    538 ; AVX512BW-NEXT:    kmovd %edi, %k1
    539 ; AVX512BW-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
    540 ; AVX512BW-NEXT:    retq
    541   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
    542   %mask.cast = bitcast i16 %mask to <16 x i1>
    543   %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> zeroinitializer
    544   ret <16 x i32> %res
    545 }
    546 
    ; 128-bit-block shuffle across two v16f32 sources: the whole of %x's four
    ; 128-bit blocks plus %x1's blocks swapped (upper half then lower half).
    ; Should select a single vshuff32x4.
    547 define <16 x float> @test_vshuff32x4_512(<16 x float> %x, <16 x float> %x1) nounwind {
    548 ; ALL-LABEL: test_vshuff32x4_512:
    549 ; ALL:       # %bb.0:
    550 ; ALL-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
    551 ; ALL-NEXT:    retq
    552   %res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
    553   ret <16 x float> %res
    554 }
    555 
    ; Integer analogue of test_vshuff32x4_512: identical mask on v16i32
    ; sources should select vshufi32x4 instead of the FP form.
    556 define <16 x i32> @test_vshufi32x4_512(<16 x i32> %x, <16 x i32> %x1) nounwind {
    557 ; ALL-LABEL: test_vshufi32x4_512:
    558 ; ALL:       # %bb.0:
    559 ; ALL-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
    560 ; ALL-NEXT:    retq
    561   %res = shufflevector <16 x i32> %x, <16 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
    562   ret <16 x i32> %res
    563 }
    564 
    ; Masked form of test_vshuff32x4_512 where the predicate arrives as an
    ; explicit <16 x i1> vector (in xmm3) rather than a scalar i16.  The mask
    ; must be materialized into k1 first: AVX512F sign-extends the bytes to
    ; dwords, shifts the flag bit to bit 31, and uses vpmovd2m; AVX512BW can
    ; shift the bytes directly and use vpmovb2m.  The shuffle itself becomes a
    ; merge-masked vshuff32x4 into the %y register (zmm2).
    565 define <16 x float> @test_vshuff32x4_512_mask(<16 x float> %x, <16 x float> %x1, <16 x float> %y, <16 x i1> %mask) nounwind {
    566 ; AVX512F-LABEL: test_vshuff32x4_512_mask:
    567 ; AVX512F:       # %bb.0:
    568 ; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
    569 ; AVX512F-NEXT:    vpslld $31, %zmm3, %zmm3
    570 ; AVX512F-NEXT:    vpmovd2m %zmm3, %k1
    571 ; AVX512F-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
    572 ; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
    573 ; AVX512F-NEXT:    retq
    574 ;
    575 ; AVX512BW-LABEL: test_vshuff32x4_512_mask:
    576 ; AVX512BW:       # %bb.0:
    577 ; AVX512BW-NEXT:    vpsllw $7, %xmm3, %xmm3
    578 ; AVX512BW-NEXT:    vpmovb2m %zmm3, %k1
    579 ; AVX512BW-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
    580 ; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
    581 ; AVX512BW-NEXT:    retq
    582   %x2 = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
    583   %res = select <16 x i1> %mask, <16 x float> %x2, <16 x float> %y
    584   ret <16 x float> %res
    585 }
    586 
    ; Integer analogue of test_vshuff32x4_512_mask: same <16 x i1> mask
    ; materialization into k1, but the shuffle selects the integer-domain
    ; vshufi32x4 and the final copy uses vmovdqa64 instead of vmovaps.
    587 define <16 x i32> @test_vshufi32x4_512_mask(<16 x i32> %x, <16 x i32> %x1, <16 x i32> %y, <16 x i1> %mask) nounwind {
    588 ; AVX512F-LABEL: test_vshufi32x4_512_mask:
    589 ; AVX512F:       # %bb.0:
    590 ; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
    591 ; AVX512F-NEXT:    vpslld $31, %zmm3, %zmm3
    592 ; AVX512F-NEXT:    vpmovd2m %zmm3, %k1
    593 ; AVX512F-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
    594 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
    595 ; AVX512F-NEXT:    retq
    596 ;
    597 ; AVX512BW-LABEL: test_vshufi32x4_512_mask:
    598 ; AVX512BW:       # %bb.0:
    599 ; AVX512BW-NEXT:    vpsllw $7, %xmm3, %xmm3
    600 ; AVX512BW-NEXT:    vpmovb2m %zmm3, %k1
    601 ; AVX512BW-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
    602 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
    603 ; AVX512BW-NEXT:    retq
    604   %x2 = shufflevector <16 x i32> %x, <16 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
    605   %res = select <16 x i1> %mask, <16 x i32> %x2, <16 x i32> %y
    606   ret <16 x i32> %res
    607 }
    608 
    ; Low half of %a concatenated with low half of %b: recognized as a
    ; 256-bit subvector insertion, so the masked shuffle folds into a
    ; merge-masked vinsertf32x8 $1 (requires AVX512DQ, enabled in both RUN
    ; lines) targeting the %passthru register.
    609 define <16 x float> @mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) {
    610 ; AVX512F-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
    611 ; AVX512F:       # %bb.0:
    612 ; AVX512F-NEXT:    kmovw %edi, %k1
    613 ; AVX512F-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
    614 ; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
    615 ; AVX512F-NEXT:    retq
    616 ;
    617 ; AVX512BW-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
    618 ; AVX512BW:       # %bb.0:
    619 ; AVX512BW-NEXT:    kmovd %edi, %k1
    620 ; AVX512BW-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
    621 ; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
    622 ; AVX512BW-NEXT:    retq
    623   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
    624   %mask.cast = bitcast i16 %mask to <16 x i1>
    625   %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru
    626   ret <16 x float> %res
    627 }
    628 
    ; %b's low 128 bits dropped into element block 1 of %a: a 128-bit
    ; subvector insertion, folding to a merge-masked vinsertf32x4 $1.
    629 define <16 x float> @mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) {
    630 ; AVX512F-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
    631 ; AVX512F:       # %bb.0:
    632 ; AVX512F-NEXT:    kmovw %edi, %k1
    633 ; AVX512F-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
    634 ; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
    635 ; AVX512F-NEXT:    retq
    636 ;
    637 ; AVX512BW-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
    638 ; AVX512BW:       # %bb.0:
    639 ; AVX512BW-NEXT:    kmovd %edi, %k1
    640 ; AVX512BW-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
    641 ; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
    642 ; AVX512BW-NEXT:    retq
    643   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    644   %mask.cast = bitcast i16 %mask to <16 x i1>
    645   %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru
    646   ret <16 x float> %res
    647 }
    648 
    ; Integer analogue of the 256-bit subvector-insert test: the masked
    ; v16i32 shuffle folds to vinserti32x8 $1 with vmovdqa64 for the copy.
    649 define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
    650 ; AVX512F-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
    651 ; AVX512F:       # %bb.0:
    652 ; AVX512F-NEXT:    kmovw %edi, %k1
    653 ; AVX512F-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
    654 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
    655 ; AVX512F-NEXT:    retq
    656 ;
    657 ; AVX512BW-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
    658 ; AVX512BW:       # %bb.0:
    659 ; AVX512BW-NEXT:    kmovd %edi, %k1
    660 ; AVX512BW-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
    661 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
    662 ; AVX512BW-NEXT:    retq
    663   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
    664   %mask.cast = bitcast i16 %mask to <16 x i1>
    665   %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
    666   ret <16 x i32> %res
    667 }
    668 
    ; Integer analogue of the 128-bit subvector-insert test: folds to a
    ; merge-masked vinserti32x4 $1.
    669 define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
    670 ; AVX512F-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
    671 ; AVX512F:       # %bb.0:
    672 ; AVX512F-NEXT:    kmovw %edi, %k1
    673 ; AVX512F-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
    674 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
    675 ; AVX512F-NEXT:    retq
    676 ;
    677 ; AVX512BW-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
    678 ; AVX512BW:       # %bb.0:
    679 ; AVX512BW-NEXT:    kmovd %edi, %k1
    680 ; AVX512BW-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
    681 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
    682 ; AVX512BW-NEXT:    retq
    683   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    684   %mask.cast = bitcast i16 %mask to <16 x i1>
    685   %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
    686   ret <16 x i32> %res
    687 }
    688 
    ; A <4 x i32> repeated four times to fill a zmm: built by doubling with
    ; vinsertf128 (xmm -> ymm) then vinsertf64x4 (ymm -> zmm).
    ; NOTE(review): despite the "mask_" prefix there is no mask operand in
    ; this test or the float twin below.
    689 define <16 x i32> @mask_shuffle_v4i32_v16i32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03(<4 x i32> %a) {
    690 ; ALL-LABEL: mask_shuffle_v4i32_v16i32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03:
    691 ; ALL:       # %bb.0:
    692 ; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
    693 ; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    694 ; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
    695 ; ALL-NEXT:    retq
    696   %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    697   ret <16 x i32> %res
    698 }
    699 
    ; Float twin of the v4i32 splat-subvector test above: a <4 x float>
    ; repeated four times across the zmm via vinsertf128 + vinsertf64x4.
    ; Expected codegen is identical to the integer version.
    700 define <16 x float> @mask_shuffle_v4f32_v16f32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03(<4 x float> %a) {
    701 ; ALL-LABEL: mask_shuffle_v4f32_v16f32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03:
    702 ; ALL:       # %bb.0:
    703 ; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
    704 ; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    705 ; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
    706 ; ALL-NEXT:    retq
    707   %res = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    708   ret <16 x float> %res
    709 }
    710