; Home | History | Annotate | Download | only in X86  (code-browser export banner, not part of the original test; kept as an IR comment so the file stays parseable)
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
      3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64
      4 
      5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
      6 
; _mm512_kunpackd: take the low 32 bits of two 64-byte-compare masks,
; concatenate them into one 64-bit mask (kunpckdq on X64), and AND with a
; third byte-compare mask. On X86 an i64 mask result is returned split across
; eax/edx, so the backend emits per-half kandd/kshiftrq instead of kunpckdq.
      7 define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
      8 ; X86-LABEL: test_mm512_kunpackd:
      9 ; X86:       # %bb.0: # %entry
     10 ; X86-NEXT:    pushl %ebp
     11 ; X86-NEXT:    .cfi_def_cfa_offset 8
     12 ; X86-NEXT:    .cfi_offset %ebp, -8
     13 ; X86-NEXT:    movl %esp, %ebp
     14 ; X86-NEXT:    .cfi_def_cfa_register %ebp
     15 ; X86-NEXT:    andl $-64, %esp
     16 ; X86-NEXT:    subl $64, %esp
     17 ; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
     18 ; X86-NEXT:    vpcmpneqb %zmm0, %zmm1, %k0
     19 ; X86-NEXT:    vpcmpneqb 8(%ebp), %zmm2, %k1
     20 ; X86-NEXT:    vpcmpneqb 72(%ebp), %zmm3, %k2
     21 ; X86-NEXT:    kandd %k0, %k2, %k0
     22 ; X86-NEXT:    kmovd %k0, %eax
     23 ; X86-NEXT:    kshiftrq $32, %k2, %k0
     24 ; X86-NEXT:    kandd %k1, %k0, %k0
     25 ; X86-NEXT:    kmovd %k0, %edx
     26 ; X86-NEXT:    movl %ebp, %esp
     27 ; X86-NEXT:    popl %ebp
     28 ; X86-NEXT:    .cfi_def_cfa %esp, 4
     29 ; X86-NEXT:    vzeroupper
     30 ; X86-NEXT:    retl
     31 ;
     32 ; X64-LABEL: test_mm512_kunpackd:
     33 ; X64:       # %bb.0: # %entry
     34 ; X64-NEXT:    vpcmpneqb %zmm0, %zmm1, %k0
     35 ; X64-NEXT:    vpcmpneqb %zmm3, %zmm2, %k1
     36 ; X64-NEXT:    kunpckdq %k0, %k1, %k1
     37 ; X64-NEXT:    vpcmpneqb %zmm5, %zmm4, %k0 {%k1}
     38 ; X64-NEXT:    kmovq %k0, %rax
     39 ; X64-NEXT:    vzeroupper
     40 ; X64-NEXT:    retq
     41 entry:
     42   %0 = bitcast <8 x i64> %__E to <64 x i8>
     43   %1 = bitcast <8 x i64> %__F to <64 x i8>
     44   %2 = bitcast <8 x i64> %__B to <64 x i8>
     45   %3 = bitcast <8 x i64> %__A to <64 x i8>
     46   %4 = icmp ne <64 x i8> %2, %3
     47   %5 = bitcast <8 x i64> %__C to <64 x i8>
     48   %6 = bitcast <8 x i64> %__D to <64 x i8>
     49   %7 = icmp ne <64 x i8> %5, %6
; %8/%9 extract bits 0..31 of each i64 compare mask; %10 concatenates them.
     50   %8 = shufflevector <64 x i1> %4, <64 x i1> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
     51   %9 = shufflevector <64 x i1> %7, <64 x i1> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
     52   %10 = shufflevector <32 x i1> %8, <32 x i1> %9, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
     53   %11 = icmp ne <64 x i8> %0, %1
     54   %12 = and <64 x i1> %11, %10
     55   %13 = bitcast <64 x i1> %12 to i64
     56   ret i64 %13
     57 }
     58 
; _mm512_kunpackw: same pattern as kunpackd but for 32x i16 compares — the low
; 16 bits of two 32-bit masks are concatenated (kunpckwd) and ANDed with a
; third compare. The i32 result fits in eax, so X86 needs no register pair.
     59 define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
     60 ; X86-LABEL: test_mm512_kunpackw:
     61 ; X86:       # %bb.0: # %entry
     62 ; X86-NEXT:    pushl %ebp
     63 ; X86-NEXT:    .cfi_def_cfa_offset 8
     64 ; X86-NEXT:    .cfi_offset %ebp, -8
     65 ; X86-NEXT:    movl %esp, %ebp
     66 ; X86-NEXT:    .cfi_def_cfa_register %ebp
     67 ; X86-NEXT:    andl $-64, %esp
     68 ; X86-NEXT:    subl $64, %esp
     69 ; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
     70 ; X86-NEXT:    vpcmpneqw %zmm0, %zmm1, %k0
     71 ; X86-NEXT:    vpcmpneqw 8(%ebp), %zmm2, %k1
     72 ; X86-NEXT:    kunpckwd %k0, %k1, %k1
     73 ; X86-NEXT:    vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
     74 ; X86-NEXT:    kmovd %k0, %eax
     75 ; X86-NEXT:    movl %ebp, %esp
     76 ; X86-NEXT:    popl %ebp
     77 ; X86-NEXT:    .cfi_def_cfa %esp, 4
     78 ; X86-NEXT:    vzeroupper
     79 ; X86-NEXT:    retl
     80 ;
     81 ; X64-LABEL: test_mm512_kunpackw:
     82 ; X64:       # %bb.0: # %entry
     83 ; X64-NEXT:    vpcmpneqw %zmm0, %zmm1, %k0
     84 ; X64-NEXT:    vpcmpneqw %zmm3, %zmm2, %k1
     85 ; X64-NEXT:    kunpckwd %k0, %k1, %k1
     86 ; X64-NEXT:    vpcmpneqw %zmm5, %zmm4, %k0 {%k1}
     87 ; X64-NEXT:    kmovd %k0, %eax
     88 ; X64-NEXT:    vzeroupper
     89 ; X64-NEXT:    retq
     90 entry:
     91   %0 = bitcast <8 x i64> %__E to <32 x i16>
     92   %1 = bitcast <8 x i64> %__F to <32 x i16>
     93   %2 = bitcast <8 x i64> %__B to <32 x i16>
     94   %3 = bitcast <8 x i64> %__A to <32 x i16>
     95   %4 = icmp ne <32 x i16> %2, %3
     96   %5 = bitcast <8 x i64> %__C to <32 x i16>
     97   %6 = bitcast <8 x i64> %__D to <32 x i16>
     98   %7 = icmp ne <32 x i16> %5, %6
; %8/%9 extract bits 0..15 of each i32 compare mask; %10 concatenates them.
     99   %8 = shufflevector <32 x i1> %4, <32 x i1> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    100   %9 = shufflevector <32 x i1> %7, <32 x i1> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    101   %10 = shufflevector <16 x i1> %8, <16 x i1> %9, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    102   %11 = icmp ne <32 x i16> %0, %1
    103   %12 = and <32 x i1> %11, %10
    104   %13 = bitcast <32 x i1> %12 to i32
    105   ret i32 %13
    106 }
    107 
    108 
; _mm512_mask_set1_epi8: broadcast one i8 to all 64 lanes, merge-masked by the
; i64 mask %__M (unselected lanes keep %__O). On X86 the 64-bit mask arrives as
; two dwords and is reassembled with kunpckdq.
    109 define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A)  {
    110 ; X86-LABEL: test_mm512_mask_set1_epi8:
    111 ; X86:       # %bb.0: # %entry
    112 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
    113 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    114 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    115 ; X86-NEXT:    kunpckdq %k1, %k0, %k1
    116 ; X86-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
    117 ; X86-NEXT:    retl
    118 ;
    119 ; X64-LABEL: test_mm512_mask_set1_epi8:
    120 ; X64:       # %bb.0: # %entry
    121 ; X64-NEXT:    kmovq %rdi, %k1
    122 ; X64-NEXT:    vpbroadcastb %esi, %zmm0 {%k1}
    123 ; X64-NEXT:    retq
    124   entry:
    125   %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
    126   %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
    127   %0 = bitcast <8 x i64> %__O to <64 x i8>
    128   %1 = bitcast i64 %__M to <64 x i1>
    129   %2 = select <64 x i1> %1, <64 x i8> %vecinit63.i.i, <64 x i8> %0
    130   %3 = bitcast <64 x i8> %2 to <8 x i64>
    131   ret <8 x i64> %3
    132 }
    133 
; _mm512_maskz_set1_epi8: zero-masking variant of the broadcast above —
; unselected lanes become zero ({%k1} {z}) instead of keeping a passthrough.
    134 define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A)  {
    135 ; X86-LABEL: test_mm512_maskz_set1_epi8:
    136 ; X86:       # %bb.0: # %entry
    137 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
    138 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    139 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    140 ; X86-NEXT:    kunpckdq %k1, %k0, %k1
    141 ; X86-NEXT:    vpbroadcastb %eax, %zmm0 {%k1} {z}
    142 ; X86-NEXT:    retl
    143 ;
    144 ; X64-LABEL: test_mm512_maskz_set1_epi8:
    145 ; X64:       # %bb.0: # %entry
    146 ; X64-NEXT:    kmovq %rdi, %k1
    147 ; X64-NEXT:    vpbroadcastb %esi, %zmm0 {%k1} {z}
    148 ; X64-NEXT:    retq
    149   entry:
    150   %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
    151   %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
    152   %0 = bitcast i64 %__M to <64 x i1>
    153   %1 = select <64 x i1> %0, <64 x i8> %vecinit63.i.i, <64 x i8> zeroinitializer
    154   %2 = bitcast <64 x i8> %1 to <8 x i64>
    155   ret <8 x i64> %2
    156 }
    157 
; _mm512_mask_set1_epi16: broadcast one i16 to all 32 lanes, merge-masked by
; the i32 mask %__M. The 32-bit mask fits a single kmovd on both targets.
    158 define <8 x i64> @test_mm512_mask_set1_epi16(<8 x i64> %__O, i32 %__M, i16 signext %__A)  {
    159 ; X86-LABEL: test_mm512_mask_set1_epi16:
    160 ; X86:       # %bb.0: # %entry
    161 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
    162 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    163 ; X86-NEXT:    vpbroadcastw %eax, %zmm0 {%k1}
    164 ; X86-NEXT:    retl
    165 ;
    166 ; X64-LABEL: test_mm512_mask_set1_epi16:
    167 ; X64:       # %bb.0: # %entry
    168 ; X64-NEXT:    kmovd %edi, %k1
    169 ; X64-NEXT:    vpbroadcastw %esi, %zmm0 {%k1}
    170 ; X64-NEXT:    retq
    171   entry:
    172   %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
    173   %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
    174   %0 = bitcast <8 x i64> %__O to <32 x i16>
    175   %1 = bitcast i32 %__M to <32 x i1>
    176   %2 = select <32 x i1> %1, <32 x i16> %vecinit31.i.i, <32 x i16> %0
    177   %3 = bitcast <32 x i16> %2 to <8 x i64>
    178   ret <8 x i64> %3
    179 }
    180 
; _mm512_maskz_set1_epi16: zero-masking variant of the i16 broadcast —
; unselected lanes are zeroed ({%k1} {z}).
    181 define <8 x i64> @test_mm512_maskz_set1_epi16(i32 %__M, i16 signext %__A)  {
    182 ; X86-LABEL: test_mm512_maskz_set1_epi16:
    183 ; X86:       # %bb.0: # %entry
    184 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
    185 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    186 ; X86-NEXT:    vpbroadcastw %eax, %zmm0 {%k1} {z}
    187 ; X86-NEXT:    retl
    188 ;
    189 ; X64-LABEL: test_mm512_maskz_set1_epi16:
    190 ; X64:       # %bb.0: # %entry
    191 ; X64-NEXT:    kmovd %edi, %k1
    192 ; X64-NEXT:    vpbroadcastw %esi, %zmm0 {%k1} {z}
    193 ; X64-NEXT:    retq
    194   entry:
    195   %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
    196   %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
    197   %0 = bitcast i32 %__M to <32 x i1>
    198   %1 = select <32 x i1> %0, <32 x i16> %vecinit31.i.i, <32 x i16> zeroinitializer
    199   %2 = bitcast <32 x i16> %1 to <8 x i64>
    200   ret <8 x i64> %2
    201 }
    202 
; _mm512_broadcastb_epi8: unmasked broadcast of xmm byte 0 to all 64 lanes,
; expressed as a zero-index shufflevector from a 16-byte source.
    203 define <8 x i64> @test_mm512_broadcastb_epi8(<2 x i64> %a0) {
    204 ; CHECK-LABEL: test_mm512_broadcastb_epi8:
    205 ; CHECK:       # %bb.0:
    206 ; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm0
    207 ; CHECK-NEXT:    ret{{[l|q]}}
    208   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
    209   %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <64 x i32> zeroinitializer
    210   %res1 = bitcast <64 x i8> %res0 to <8 x i64>
    211   ret <8 x i64> %res1
    212 }
    213 
; Merge-masked byte broadcast where the <64 x i1> mask is loaded from memory
; (i64* reinterpreted as a mask pointer) — exercises kmovq from a memory
; operand on both targets.
    214 define <8 x i64> @test_mm512_mask_broadcastb_epi8(<8 x i64> %a0, i64* %a1, <2 x i64> %a2) {
    215 ; X86-LABEL: test_mm512_mask_broadcastb_epi8:
    216 ; X86:       # %bb.0:
    217 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    218 ; X86-NEXT:    kmovq (%eax), %k1
    219 ; X86-NEXT:    vpbroadcastb %xmm1, %zmm0 {%k1}
    220 ; X86-NEXT:    retl
    221 ;
    222 ; X64-LABEL: test_mm512_mask_broadcastb_epi8:
    223 ; X64:       # %bb.0:
    224 ; X64-NEXT:    kmovq (%rdi), %k1
    225 ; X64-NEXT:    vpbroadcastb %xmm1, %zmm0 {%k1}
    226 ; X64-NEXT:    retq
    227   %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
    228   %bc1 = bitcast i64* %a1 to <64 x i1>*
    229   %arg1 = load <64 x i1>, <64 x i1>* %bc1
    230   %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
    231   %res0 = shufflevector <16 x i8> %arg2, <16 x i8> undef, <64 x i32> zeroinitializer
    232   %res1 = select <64 x i1> %arg1, <64 x i8> %res0, <64 x i8> %arg0
    233   %res2 = bitcast <64 x i8> %res1 to <8 x i64>
    234   ret <8 x i64> %res2
    235 }
    236 
; Zero-masked byte broadcast with the 64-bit mask loaded from memory;
; unselected lanes become zero ({%k1} {z}).
    237 define <8 x i64> @test_mm512_maskz_broadcastb_epi8(i64* %a0, <2 x i64> %a1) {
    238 ; X86-LABEL: test_mm512_maskz_broadcastb_epi8:
    239 ; X86:       # %bb.0:
    240 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    241 ; X86-NEXT:    kmovq (%eax), %k1
    242 ; X86-NEXT:    vpbroadcastb %xmm0, %zmm0 {%k1} {z}
    243 ; X86-NEXT:    retl
    244 ;
    245 ; X64-LABEL: test_mm512_maskz_broadcastb_epi8:
    246 ; X64:       # %bb.0:
    247 ; X64-NEXT:    kmovq (%rdi), %k1
    248 ; X64-NEXT:    vpbroadcastb %xmm0, %zmm0 {%k1} {z}
    249 ; X64-NEXT:    retq
    250   %bc0 = bitcast i64* %a0 to <64 x i1>*
    251   %arg0 = load <64 x i1>, <64 x i1>* %bc0
    252   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
    253   %res0 = shufflevector <16 x i8> %arg1, <16 x i8> undef, <64 x i32> zeroinitializer
    254   %res1 = select <64 x i1> %arg0, <64 x i8> %res0, <64 x i8> zeroinitializer
    255   %res2 = bitcast <64 x i8> %res1 to <8 x i64>
    256   ret <8 x i64> %res2
    257 }
    258 
; _mm512_broadcastw_epi16: unmasked broadcast of xmm word 0 to all 32 lanes.
    259 define <8 x i64> @test_mm512_broadcastw_epi16(<2 x i64> %a0) {
    260 ; CHECK-LABEL: test_mm512_broadcastw_epi16:
    261 ; CHECK:       # %bb.0:
    262 ; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
    263 ; CHECK-NEXT:    ret{{[l|q]}}
    264   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
    265   %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <32 x i32> zeroinitializer
    266   %res1 = bitcast <32 x i16> %res0 to <8 x i64>
    267   ret <8 x i64> %res1
    268 }
    269 
; Merge-masked word broadcast; the 32-bit mask arrives by value (i32) and is
; moved into k1 with kmovd, unlike the byte versions that load i64 from memory.
    270 define <8 x i64> @test_mm512_mask_broadcastw_epi16(<8 x i64> %a0, i32 %a1, <2 x i64> %a2) {
    271 ; X86-LABEL: test_mm512_mask_broadcastw_epi16:
    272 ; X86:       # %bb.0:
    273 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    274 ; X86-NEXT:    vpbroadcastw %xmm1, %zmm0 {%k1}
    275 ; X86-NEXT:    retl
    276 ;
    277 ; X64-LABEL: test_mm512_mask_broadcastw_epi16:
    278 ; X64:       # %bb.0:
    279 ; X64-NEXT:    kmovd %edi, %k1
    280 ; X64-NEXT:    vpbroadcastw %xmm1, %zmm0 {%k1}
    281 ; X64-NEXT:    retq
    282   %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
    283   %arg1 = bitcast i32 %a1 to <32 x i1>
    284   %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
    285   %res0 = shufflevector <8 x i16> %arg2, <8 x i16> undef, <32 x i32> zeroinitializer
    286   %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
    287   %res2 = bitcast <32 x i16> %res1 to <8 x i64>
    288   ret <8 x i64> %res2
    289 }
    290 
; Zero-masked word broadcast; unselected lanes are zeroed ({%k1} {z}).
    291 define <8 x i64> @test_mm512_maskz_broadcastw_epi16(i32 %a0, <2 x i64> %a1) {
    292 ; X86-LABEL: test_mm512_maskz_broadcastw_epi16:
    293 ; X86:       # %bb.0:
    294 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    295 ; X86-NEXT:    vpbroadcastw %xmm0, %zmm0 {%k1} {z}
    296 ; X86-NEXT:    retl
    297 ;
    298 ; X64-LABEL: test_mm512_maskz_broadcastw_epi16:
    299 ; X64:       # %bb.0:
    300 ; X64-NEXT:    kmovd %edi, %k1
    301 ; X64-NEXT:    vpbroadcastw %xmm0, %zmm0 {%k1} {z}
    302 ; X64-NEXT:    retq
    303   %arg0 = bitcast i32 %a0 to <32 x i1>
    304   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
    305   %res0 = shufflevector <8 x i16> %arg1, <8 x i16> undef, <32 x i32> zeroinitializer
    306   %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
    307   %res2 = bitcast <32 x i16> %res1 to <8 x i64>
    308   ret <8 x i64> %res2
    309 }
    310 
; Byte-granular 128-bit-lane shift expressed as a shuffle with a zero vector.
; NOTE(review): the name says "bslli" (byte shift *left*), but the shuffle mask
; selects bytes 11..15 of each 16-byte lane followed by zeros, and the
; autogenerated check shows vpsrldq — i.e. a right-shift-by-11 pattern.
; Confirm against the clang avx512bw-builtins test this was derived from
; before relying on the name; checks are autogenerated, do not hand-edit.
    311 define <8 x i64> @test_mm512_bslli_epi128(<8 x i64> %a0) {
    312 ; CHECK-LABEL: test_mm512_bslli_epi128:
    313 ; CHECK:       # %bb.0:
    314 ; CHECK-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
    315 ; CHECK-NEXT:    ret{{[l|q]}}
    316   %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
    317   %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
    318   %res1 = bitcast <64 x i8> %res0 to <8 x i64>
    319   ret <8 x i64> %res1
    320 }
    321 
; _mm512_bsrli_epi128 by 5: per 128-bit lane, shift bytes right by 5 —
; modeled as a shuffle taking bytes 5..15 of each lane then 5 zeros, and
; matched as a single vpsrldq.
    322 define <8 x i64> @test_mm512_bsrli_epi128(<8 x i64> %a0) {
    323 ; CHECK-LABEL: test_mm512_bsrli_epi128:
    324 ; CHECK:       # %bb.0:
    325 ; CHECK-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
    326 ; CHECK-NEXT:    ret{{[l|q]}}
    327   %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
    328   %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
    329   %res1 = bitcast <64 x i8> %res0 to <8 x i64>
    330   ret <8 x i64> %res1
    331 }
    332 
; _mm512_unpackhi_epi8: interleave the high 8 bytes of each 128-bit lane of
; the two sources (vpunpckhbw), unmasked.
    333 define <8 x i64> @test_mm512_unpackhi_epi8(<8 x i64> %a0, <8 x i64> %a1) {
    334 ; CHECK-LABEL: test_mm512_unpackhi_epi8:
    335 ; CHECK:       # %bb.0:
    336 ; CHECK-NEXT:    vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
    337 ; CHECK-NEXT:    ret{{[l|q]}}
    338   %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
    339   %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
    340   %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
    341   %res1 = bitcast <64 x i8> %res0 to <8 x i64>
    342   ret <8 x i64> %res1
    343 }
    344 
    345 ; TODO - improve support for i64 -> mmask64 on 32-bit targets
; Merge-masked byte unpack-high: shuffle %a2/%a3, then select against %a0
; under a <64 x i1> mask loaded from memory (kmovq + {%k1} on both targets).
    346 define <8 x i64> @test_mm512_mask_unpackhi_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
    347 ; X86-LABEL: test_mm512_mask_unpackhi_epi8:
    348 ; X86:       # %bb.0:
    349 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    350 ; X86-NEXT:    kmovq (%eax), %k1
    351 ; X86-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
    352 ; X86-NEXT:    retl
    353 ;
    354 ; X64-LABEL: test_mm512_mask_unpackhi_epi8:
    355 ; X64:       # %bb.0:
    356 ; X64-NEXT:    kmovq (%rdi), %k1
    357 ; X64-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
    358 ; X64-NEXT:    retq
    359   %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
    360   %arg1 = bitcast i64* %a1 to <64 x i1>*
    361   %sel1 = load <64 x i1>, <64 x i1>* %arg1
    362   %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
    363   %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
    364   %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
    365   %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
    366   %res2 = bitcast <64 x i8> %res1 to <8 x i64>
    367   ret <8 x i64> %res2
    368 }
    369 
; Zero-masked byte unpack-high: same shuffle as above but unselected lanes
; are zeroed ({%k1} {z}); mask loaded from memory via kmovq.
    370 define <8 x i64> @test_mm512_maskz_unpackhi_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
    371 ; X86-LABEL: test_mm512_maskz_unpackhi_epi8:
    372 ; X86:       # %bb.0:
    373 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    374 ; X86-NEXT:    kmovq (%eax), %k1
    375 ; X86-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
    376 ; X86-NEXT:    retl
    377 ;
    378 ; X64-LABEL: test_mm512_maskz_unpackhi_epi8:
    379 ; X64:       # %bb.0:
    380 ; X64-NEXT:    kmovq (%rdi), %k1
    381 ; X64-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
    382 ; X64-NEXT:    retq
    383   %arg0 = bitcast i64* %a0 to <64 x i1>*
    384   %sel0 = load <64 x i1>, <64 x i1>* %arg0
    385   %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
    386   %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
    387   %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
    388   %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
    389   %res2 = bitcast <64 x i8> %res1 to <8 x i64>
    390   ret <8 x i64> %res2
    391 }
    392 
; _mm512_unpackhi_epi16: interleave the high 4 words of each 128-bit lane of
; the two sources (vpunpckhwd), unmasked.
    393 define <8 x i64> @test_mm512_unpackhi_epi16(<8 x i64> %a0, <8 x i64> %a1) {
    394 ; CHECK-LABEL: test_mm512_unpackhi_epi16:
    395 ; CHECK:       # %bb.0:
    396 ; CHECK-NEXT:    vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
    397 ; CHECK-NEXT:    ret{{[l|q]}}
    398   %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
    399   %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
    400   %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
    401   %res1 = bitcast <32 x i16> %res0 to <8 x i64>
    402   ret <8 x i64> %res1
    403 }
    404 
; Merge-masked word unpack-high: shuffle %a2/%a3, select against %a0 under
; the i32 mask %a1 (kmovd + {%k1}).
    405 define <8 x i64> @test_mm512_mask_unpackhi_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
    406 ; X86-LABEL: test_mm512_mask_unpackhi_epi16:
    407 ; X86:       # %bb.0:
    408 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    409 ; X86-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
    410 ; X86-NEXT:    retl
    411 ;
    412 ; X64-LABEL: test_mm512_mask_unpackhi_epi16:
    413 ; X64:       # %bb.0:
    414 ; X64-NEXT:    kmovd %edi, %k1
    415 ; X64-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
    416 ; X64-NEXT:    retq
    417   %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
    418   %arg1 = bitcast i32 %a1 to <32 x i1>
    419   %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
    420   %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
    421   %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
    422   %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
    423   %res2 = bitcast <32 x i16> %res1 to <8 x i64>
    424   ret <8 x i64> %res2
    425 }
    426 
; Zero-masked word unpack-high: unselected lanes are zeroed ({%k1} {z}).
    427 define <8 x i64> @test_mm512_maskz_unpackhi_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
    428 ; X86-LABEL: test_mm512_maskz_unpackhi_epi16:
    429 ; X86:       # %bb.0:
    430 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
    431 ; X86-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
    432 ; X86-NEXT:    retl
    433 ;
    434 ; X64-LABEL: test_mm512_maskz_unpackhi_epi16:
    435 ; X64:       # %bb.0:
    436 ; X64-NEXT:    kmovd %edi, %k1
    437 ; X64-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
    438 ; X64-NEXT:    retq
    439   %arg0 = bitcast i32 %a0 to <32 x i1>
    440   %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
    441   %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
    442   %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
    443   %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
    444   %res2 = bitcast <32 x i16> %res1 to <8 x i64>
    445   ret <8 x i64> %res2
    446 }
    447 
; _mm512_unpacklo_epi8: interleave the low 8 bytes of each 128-bit lane of
; the two sources (vpunpcklbw), unmasked.
    448 define <8 x i64> @test_mm512_unpacklo_epi8(<8 x i64> %a0, <8 x i64> %a1) {
    449 ; CHECK-LABEL: test_mm512_unpacklo_epi8:
    450 ; CHECK:       # %bb.0:
    451 ; CHECK-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
    452 ; CHECK-NEXT:    ret{{[l|q]}}
    453   %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
    454   %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
    455   %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
    456   %res1 = bitcast <64 x i8> %res0 to <8 x i64>
    457   ret <8 x i64> %res1
    458 }
    459 
; _mm512_mask_unpacklo_epi8: same byte interleave as above, merge-masked.
; The 64-bit mask is loaded from memory as <64 x i1> and selects between the
; shuffle result and the passthru %a0. Expected: kmovq load + masked
; vpunpcklbw {%k1}.
define <8 x i64> @test_mm512_mask_unpacklo_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = bitcast i64* %a1 to <64 x i1>*
  %sel1 = load <64 x i1>, <64 x i1>* %arg1
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
    483 
; _mm512_maskz_unpacklo_epi8: zero-masked variant — mask loaded from memory,
; unselected byte lanes become zero. Expected: kmovq load + vpunpcklbw
; {%k1} {z}.
define <8 x i64> @test_mm512_maskz_unpacklo_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to <64 x i1>*
  %sel0 = load <64 x i1>, <64 x i1>* %arg0
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
    506 
; _mm512_unpacklo_epi16: interleave the low 4 words of each 128-bit lane
; (indices 0..3/8..11/16..19/24..27 paired with 32+same). Expected to select
; a single vpunpcklwd.
define <8 x i64> @test_mm512_unpacklo_epi16(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = bitcast <32 x i16> %res0 to <8 x i64>
  ret <8 x i64> %res1
}
    518 
; _mm512_mask_unpacklo_epi16: merge-masked word interleave. The i32 mask is
; bitcast to <32 x i1> and selects between the shuffle result and passthru
; %a0. Expected: kmovd + masked vpunpcklwd {%k1}.
define <8 x i64> @test_mm512_mask_unpacklo_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast i32 %a1 to <32 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
    540 
; _mm512_maskz_unpacklo_epi16: zero-masked word interleave — unselected word
; lanes become zero. Expected: kmovd + vpunpcklwd {%k1} {z}.
define <8 x i64> @test_mm512_maskz_unpacklo_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X64-NEXT:    retq
  %arg0 = bitcast i32 %a0 to <32 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
    561 
; _mm512_test_epi8_mask: (and %__B, %__A) != 0 per byte, folded to vptestmb.
; On X86 the i64 mask result is returned split across eax:edx via kshiftrq.
define i64 @test_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_test_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_test_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp ne <64 x i8> %0, zeroinitializer
  %2 = bitcast <64 x i1> %1 to i64
  ret i64 %2
}
    585 
; _mm512_mask_test_epi8_mask: byte-wise test-under-mask. On X64 %__U becomes
; the write-mask of vptestmb; on X86 the and with %__U is done with scalar
; andl on the two 32-bit halves.
define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vptestmb %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp ne <64 x i8> %0, zeroinitializer
  %2 = bitcast i64 %__U to <64 x i1>
  %3 = and <64 x i1> %1, %2
  %4 = bitcast <64 x i1> %3 to i64
  ret i64 %4
}
    614 
; _mm512_test_epi16_mask: (and %__B, %__A) != 0 per word, folded to vptestmw;
; the 32-bit mask fits one GPR on both targets, so a common CHECK suffices.
define i32 @test_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_test_epi16_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmw %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp ne <32 x i16> %0, zeroinitializer
  %2 = bitcast <32 x i1> %1 to i32
  ret i32 %2
}
    629 
; _mm512_mask_test_epi16_mask: word-wise test-under-mask; %__U is applied as
; the write-mask of vptestmw on both targets.
define i32 @test_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi16_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vptestmw %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi16_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vptestmw %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp ne <32 x i16> %0, zeroinitializer
  %2 = bitcast i32 %__U to <32 x i1>
  %3 = and <32 x i1> %1, %2
  %4 = bitcast <32 x i1> %3 to i32
  ret i32 %4
}
    655 
; _mm512_testn_epi8_mask: (and %__B, %__A) == 0 per byte, folded to
; vptestnmb. X86 splits the 64-bit result into eax:edx.
define i64 @test_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_testn_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_testn_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp eq <64 x i8> %0, zeroinitializer
  %2 = bitcast <64 x i1> %1 to i64
  ret i64 %2
}
    679 
; _mm512_mask_testn_epi8_mask: byte-wise testn-under-mask. On X64 %__U is the
; vptestnmb write-mask; on X86 it is applied with scalar andl on the two
; 32-bit halves.
define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vptestnmb %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp eq <64 x i8> %0, zeroinitializer
  %2 = bitcast i64 %__U to <64 x i1>
  %3 = and <64 x i1> %1, %2
  %4 = bitcast <64 x i1> %3 to i64
  ret i64 %4
}
    708 
; _mm512_testn_epi16_mask: (and %__B, %__A) == 0 per word, folded to
; vptestnmw; 32-bit result, common CHECK for both targets.
define i32 @test_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi16_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmw %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp eq <32 x i16> %0, zeroinitializer
  %2 = bitcast <32 x i1> %1 to i32
  ret i32 %2
}
    723 
; _mm512_mask_testn_epi16_mask: word-wise testn-under-mask; %__U is applied
; as the write-mask of vptestnmw on both targets.
define i32 @test_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi16_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vptestnmw %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi16_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vptestnmw %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp eq <32 x i16> %0, zeroinitializer
  %2 = bitcast i32 %__U to <32 x i1>
  %3 = and <32 x i1> %1, %2
  %4 = bitcast <32 x i1> %3 to i32
  ret i32 %4
}
    749 
; _mm512_cvtepi16_epi8: truncate <32 x i16> to <32 x i8>, expected to select
; a single vpmovwb (zmm -> ymm).
define <4 x i64> @test_mm512_cvtepi16_epi8(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi16_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovwb %zmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast <32 x i8> %conv.i to <4 x i64>
  ret <4 x i64> %1
}
    761 
; _mm512_mask_cvtepi16_epi8: merge-masked word->byte truncate; %__M selects
; between the truncated value and passthru %__O. Expected: kmovd + vpmovwb
; {%k1}.
define <4 x i64> @test_mm512_mask_cvtepi16_epi8(<4 x i64> %__O, i32 %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi16_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpmovwb %zmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi16_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpmovwb %zmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast <4 x i64> %__O to <32 x i8>
  %2 = bitcast i32 %__M to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i8> %conv.i.i, <32 x i8> %1
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  ret <4 x i64> %4
}
    783 
; _mm512_maskz_cvtepi16_epi8: zero-masked word->byte truncate. Expected:
; kmovd + vpmovwb {%k1} {z}.
define <4 x i64> @test_mm512_maskz_cvtepi16_epi8(i32 %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi16_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpmovwb %zmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi16_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpmovwb %zmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast i32 %__M to <32 x i1>
  %2 = select <32 x i1> %1, <32 x i8> %conv.i.i, <32 x i8> zeroinitializer
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}
    804 
; _mm512_mask2_permutex2var_epi16: two-source word permute where the
; passthru under the mask is the index vector %__I. Expected: masked
; vpermi2w (which leaves the result in the index register) + vmovdqa64 copy.
define <8 x i64> @test_mm512_mask2_permutex2var_epi16(<8 x i64> %__A, <8 x i64> %__I, i32 %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %1
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}
    829 
; _mm512_permutex2var_epi16: unmasked two-source word permute via the
; vpermi2var.hi.512 intrinsic. Expected to select vpermt2w.
define <8 x i64> @test_mm512_permutex2var_epi16(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast <32 x i16> %3 to <8 x i64>
  ret <8 x i64> %4
}
    843 
; _mm512_mask_permutex2var_epi16: two-source word permute merge-masked with
; the first data operand %__A as passthru. Expected: masked vpermt2w {%k1}.
define <8 x i64> @test_mm512_mask_permutex2var_epi16(<8 x i64> %__A, i32 %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %0
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}
    866 
; _mm512_maskz_permutex2var_epi16: zero-masked two-source word permute.
; Expected: vpermt2w {%k1} {z}.
define <8 x i64> @test_mm512_maskz_permutex2var_epi16(i32 %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}
    889 
    890 declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>)
    891 
    892 !0 = !{i32 1}
    893 
    894