; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VBMI

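; Only bytes 2-3 of the low 128-bit lane are demanded, so the shuffle lowers
; to a single 32-bit element shift (vpsrld $16) on every subtarget.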
define <64 x i8> @shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u(<64 x i8> %a)  {
; ALL-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrld $16, %xmm0, %xmm0
; ALL-NEXT:    retq
  %b = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <64 x i8> %b
}

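; Shift each 128-bit lane left by one byte with zero fill: a single zmm
; vpslldq with 512-bit byte shuffles (AVX512BW/VBMI), two ymm ops otherwise.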
define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512F-NEXT:    vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpslldq {{.*#+}} zmm0 = zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512DQ-NEXT:    vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpslldq {{.*#+}} zmm0 = zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
; AVX512VBMI-NEXT:    retq
  %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
  ret <64 x i8> %shuffle
}

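; Shift each 128-bit lane right by two bytes with zero fill (vpsrldq).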
define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX512F-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zmm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zmm0[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zmm0[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zmm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zmm0[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zmm0[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
; AVX512VBMI-NEXT:    retq
  %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64>
  ret <64 x i8> %shuffle
}

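; Rotate the top byte of each 128-bit lane of %b in front of each lane of %a:
; vpalignr on zmm with AVX512BW/VBMI, split into two ymm ops otherwise.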
define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512F-NEXT:    vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zmm1[31],zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zmm1[47],zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zmm1[63],zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512DQ-NEXT:    vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpalignr {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zmm1[31],zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zmm1[47],zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zmm1[63],zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
; AVX512VBMI-NEXT:    retq
  %shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
  ret <64 x i8> %shuffle
}


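; Keep byte 0 and zero everything else. Without 512-bit byte shuffles this is
; an and with a {255,0,...} constant; with them it is a single vpshufb.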
define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movl $255, %eax
; AVX512F-NEXT:    vmovd %eax, %xmm1
; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movl $255, %eax
; AVX512DQ-NEXT:    vmovd %eax, %xmm1
; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI-NEXT:    retq
  %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64>
  ret <64 x i8> %shuffle
}

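; Splat byte 0 across all 64 elements (vpbroadcastb).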
define <64 x i8> @shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %xmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpbroadcastb %xmm0, %zmm0
; AVX512VBMI-NEXT:    retq
  %shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <64 x i8> %shuffle
}

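; Full byte reverse. AVX512VBMI needs only one vpermb; AVX512BW reverses the
; bytes within each 128-bit lane (vpshufb) and then reverses the lanes
; (vshufi64x2); the others do the same per ymm half.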
define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512F-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512F-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48]
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT:    retq
  %shuffle = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <64 x i8> %shuffle
}

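; Splat the low byte of a loaded i32 straight from memory: vpbroadcastb (%rdi).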
define <64 x i8> @insert_dup_mem_v64i8_i32(i32* %ptr) {
; AVX512F-LABEL: insert_dup_mem_v64i8_i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: insert_dup_mem_v64i8_i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb (%rdi), %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: insert_dup_mem_v64i8_i32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: insert_dup_mem_v64i8_i32:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpbroadcastb (%rdi), %zmm0
; AVX512VBMI-NEXT:    retq
  %tmp = load i32, i32* %ptr, align 4
  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
  %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <64 x i32> zeroinitializer
  ret <64 x i8> %tmp3
}

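; The sign extension does not change byte 0, so this also folds to a
; vpbroadcastb from memory.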
define <64 x i8> @insert_dup_mem_v64i8_sext_i8(i8* %ptr) {
; AVX512F-LABEL: insert_dup_mem_v64i8_sext_i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: insert_dup_mem_v64i8_sext_i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb (%rdi), %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: insert_dup_mem_v64i8_sext_i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: insert_dup_mem_v64i8_sext_i8:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpbroadcastb (%rdi), %zmm0
; AVX512VBMI-NEXT:    retq
  %tmp = load i8, i8* %ptr, align 1
  %tmp1 = sext i8 %tmp to i32
  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
  %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <64 x i32> zeroinitializer
  ret <64 x i8> %tmp4
}

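; Splatting byte 1 of the loaded i32 folds the offset into the load address.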
define <64 x i8> @insert_dup_elt1_mem_v64i8_i32(i32* %ptr) {
; AVX512F-LABEL: insert_dup_elt1_mem_v64i8_i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastb 1(%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb 1(%rdi), %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: insert_dup_elt1_mem_v64i8_i32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpbroadcastb 1(%rdi), %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: insert_dup_elt1_mem_v64i8_i32:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpbroadcastb 1(%rdi), %zmm0
; AVX512VBMI-NEXT:    retq
  %tmp = load i32, i32* %ptr, align 4
  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
  %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <64 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <64 x i8> %tmp3
}

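; Likewise for byte 3: broadcast from 3(%rdi).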
define <64 x i8> @insert_dup_elt3_mem_v64i8_i32(i32* %ptr) {
; AVX512F-LABEL: insert_dup_elt3_mem_v64i8_i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastb 3(%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: insert_dup_elt3_mem_v64i8_i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb 3(%rdi), %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: insert_dup_elt3_mem_v64i8_i32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpbroadcastb 3(%rdi), %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: insert_dup_elt3_mem_v64i8_i32:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpbroadcastb 3(%rdi), %zmm0
; AVX512VBMI-NEXT:    retq
  %tmp = load i32, i32* %ptr, align 4
  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
  %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <64 x i8> %tmp3
}

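; Byte 1 of a sign-extended i8 cannot be loaded directly, so the scalar is
; materialized (movsbl + shrl $8) and then broadcast from a register.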
define <64 x i8> @insert_dup_elt1_mem_v64i8_sext_i8(i8* %ptr) {
; AVX512F-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movsbl (%rdi), %eax
; AVX512F-NEXT:    shrl $8, %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movsbl (%rdi), %eax
; AVX512BW-NEXT:    shrl $8, %eax
; AVX512BW-NEXT:    vpbroadcastb %eax, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movsbl (%rdi), %eax
; AVX512DQ-NEXT:    shrl $8, %eax
; AVX512DQ-NEXT:    vmovd %eax, %xmm0
; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    movsbl (%rdi), %eax
; AVX512VBMI-NEXT:    shrl $8, %eax
; AVX512VBMI-NEXT:    vpbroadcastb %eax, %zmm0
; AVX512VBMI-NEXT:    retq
  %tmp = load i8, i8* %ptr, align 1
  %tmp1 = sext i8 %tmp to i32
  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
  %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <64 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <64 x i8> %tmp4
}

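; Zero extend the low 8 bytes to 8 quadwords: a single zmm vpmovzxbq with
; AVX512BW/VBMI, two ymm vpmovzxbq steps otherwise.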
define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX512F-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI-NEXT:    retq
  %shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> <i32 64, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 65, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 66, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 67, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 68, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 69, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 70, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 71, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <64 x i8> %shuffle
}

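; Zero extend the low 16 bytes to 16 doublewords (vpmovzxbd).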
define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VBMI-NEXT:    retq
  %shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> <i32 64, i32 0, i32 0, i32 0, i32 65, i32 0, i32 0, i32 0, i32 66, i32 0, i32 0, i32 0, i32 67, i32 0, i32 0, i32 0, i32 68, i32 0, i32 0, i32 0, i32 69, i32 0, i32 0, i32 0, i32 70, i32 0, i32 0, i32 0, i32 71, i32 0, i32 0, i32 0, i32 72, i32 0, i32 0, i32 0, i32 73, i32 0, i32 0, i32 0, i32 74, i32 0, i32 0, i32 0, i32 75, i32 0, i32 0, i32 0, i32 76, i32 0, i32 0, i32 0, i32 77, i32 0, i32 0, i32 0, i32 78, i32 0, i32 0, i32 0, i32 79, i32 0, i32 0, i32 0>
  ret <64 x i8> %shuffle
}

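; Zero extend the low 32 bytes to 32 words (vpmovzxbw).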
define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VBMI-NEXT:    retq
  %shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> <i32 64, i32 0, i32 65, i32 0, i32 66, i32 0, i32 67, i32 0, i32 68, i32 0, i32 69, i32 0, i32 70, i32 0, i32 71, i32 0, i32 72, i32 0, i32 73, i32 0, i32 74, i32 0, i32 75, i32 0, i32 76, i32 0, i32 77, i32 0, i32 78, i32 0, i32 79, i32 0, i32 80, i32 0, i32 81, i32 0, i32 82, i32 0, i32 83, i32 0, i32 84, i32 0, i32 85, i32 0, i32 86, i32 0, i32 87, i32 0, i32 88, i32 0, i32 89, i32 0, i32 90, i32 0, i32 91, i32 0, i32 92, i32 0, i32 93, i32 0, i32 94, i32 0, i32 95, i32 0>
  ret <64 x i8> %shuffle
}

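; Reverse the odd bytes and interleave with zero. AVX512VBMI selects the zeros
; through the vpermt2b index; the other subtargets shuffle per ymm half and
; then mask the odd result bytes with vpand.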
define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u>
; AVX512F-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512F-NEXT:    vpand %ymm4, %ymm1, %ymm2
; AVX512F-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512F-NEXT:    vpand %ymm4, %ymm0, %ymm1
; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u>
; AVX512BW-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512BW-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u>
; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512DQ-NEXT:    vpand %ymm4, %ymm1, %ymm2
; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm1
; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [63,65,61,67,59,69,57,71,55,73,53,75,51,77,49,79,47,81,45,83,43,85,41,87,39,89,37,91,35,93,33,95,31,97,29,99,27,101,25,103,23,105,21,107,19,109,17,111,15,113,13,115,11,117,9,119,7,121,5,123,3,125,1,127]
; AVX512VBMI-NEXT:    vpermt2b %zmm1, %zmm2, %zmm0
; AVX512VBMI-NEXT:    retq
  %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 63, i32 64, i32 61, i32 64, i32 59, i32 64, i32 57, i32 64, i32 55, i32 64, i32 53, i32 64, i32 51, i32 64, i32 49, i32 64, i32 47, i32 64, i32 45, i32 64, i32 43, i32 64, i32 41, i32 64, i32 39, i32 64, i32 37, i32 64, i32 35, i32 64, i32 33, i32 64, i32 31, i32 64, i32 29, i32 64, i32 27, i32 64, i32 25, i32 64, i32 23, i32 64, i32 21, i32 64, i32 19, i32 64, i32 17, i32 64, i32 15, i32 64, i32 13, i32 64, i32 11, i32 64, i32 9, i32 64, i32 7, i32 64, i32 5, i32 64, i32 3, i32 64, i32 1, i32 64>
  ret <64 x i8> %shuffle
}

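; Alternate the reversed odd bytes of %a with the even bytes of %b: a single
; vpermt2b with AVX512VBMI, otherwise a chain of blends (vpblendvb), lane
; swaps (vpermq) and per-lane shuffles (vpshufb).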
define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512F-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512F-NEXT:    vpshufb %ymm5, %ymm1, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512F-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    vpshufb %ymm5, %ymm0, %ymm1
; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
; AVX512BW-NEXT:    vpblendvb %ymm3, %ymm2, %ymm4, %ymm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512BW-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT:    vpblendvb %ymm3, %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512BW-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512DQ-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm1, %ymm2
; AVX512DQ-NEXT:    vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm0, %ymm1
; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [63,64,61,66,59,68,57,70,55,72,53,74,51,76,49,78,47,80,45,82,43,84,41,86,39,88,37,90,35,92,33,94,31,96,29,98,27,100,25,102,23,104,21,106,19,108,17,110,15,112,13,114,11,116,9,118,7,120,5,122,3,124,1,126]
; AVX512VBMI-NEXT:    vpermt2b %zmm1, %zmm2, %zmm0
; AVX512VBMI-NEXT:    retq
  %shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 63, i32 64, i32 61, i32 66, i32 59, i32 68, i32 57, i32 70, i32 55, i32 72, i32 53, i32 74, i32 51, i32 76, i32 49, i32 78, i32 47, i32 80, i32 45, i32 82, i32 43, i32 84, i32 41, i32 86, i32 39, i32 88, i32 37, i32 90, i32 35, i32 92, i32 33, i32 94, i32 31, i32 96, i32 29, i32 98, i32 27, i32 100, i32 25, i32 102, i32 23, i32 104, i32 21, i32 106, i32 19, i32 108, i32 17, i32 110, i32 15, i32 112, i32 13, i32 114, i32 11, i32 116, i32 9, i32 118, i32 7, i32 120, i32 5, i32 122, i32 3, i32 124, i32 1, i32 126>
  ret <64 x i8> %shuffle
}

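; Gather the high byte of every i16 after an lshr by 8: vpermt2b on the odd
; bytes with AVX512VBMI, otherwise vpsrlw + vpackuswb with a vpermq lane fixup.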
define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm1
; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512BW-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512BW-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm1
; AVX512DQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63,65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95,97,99,101,103,105,107,109,111,113,115,117,119,121,123,125,127]
; AVX512VBMI-NEXT:    vpermt2b %zmm1, %zmm2, %zmm0
; AVX512VBMI-NEXT:    retq
  %1 = lshr <32 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = lshr <32 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %3 = bitcast <32 x i16> %1 to <64 x i8>
  %4 = bitcast <32 x i16> %2 to <64 x i8>
  %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
  ret <64 x i8> %5
}