; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c
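;
; For orientation, the IR below is the kind of code clang emits for the masked
; AVX-512VL conversion intrinsics. A minimal C sketch of the first pair of
; tests (illustrative only, not part of this test file) might look like:
;
;   #include <immintrin.h>
;   __m128 do_mask_cvtepi32_ps(__m128 W, __mmask8 U, __m128i A) {
;     return _mm_mask_cvtepi32_ps(W, U, A);   // blend converted lanes into W
;   }
;   __m128 do_maskz_cvtepi32_ps(__mmask8 U, __m128i A) {
;     return _mm_maskz_cvtepi32_ps(U, A);     // zero the lanes where U is 0
;   }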

define <4 x float> @test_mm_mask_cvtepi32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
  ret <4 x float> %2
}
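
; The mask pattern above recurs throughout the 128-bit tests: the i8 mask is
; bitcast to <8 x i1> and its low four bits are extracted with a
; shufflevector, so the select blends only the four lanes a 128-bit vector
; actually has. In C terms (illustrative only):
;
;   // for i in 0..3:  r[i] = (U >> i) & 1 ? (float)(int32_t)A32[i] : W[i]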

define <4 x float> @test_mm_maskz_cvtepi32_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_cvtepi32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_cvtepi32_ps(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <2 x i64> @test_mm_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}
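
; Unlike the cvtepi32_ps tests, the 128-bit pd->dq conversions keep the mask
; inside the @llvm.x86.avx512.mask.* intrinsic: only two of the four output
; lanes come from the two input doubles, so a plain four-lane IR select could
; not express the merge. Roughly, in C terms (illustrative only):
;
;   __m128i r = _mm_mask_cvtpd_epi32(W, U, A);
;   // r[i] = (U >> i) & 1 ? (int32_t)A[i] : W[i]  for i = 0,1;  r[2] = r[3] = 0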

define <2 x i64> @test_mm_maskz_cvtpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm256_maskz_cvtpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <4 x float> @test_mm_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> %__W, i8 %__U) #8
  ret <4 x float> %0
}

define <4 x float> @test_mm_maskz_cvtpd_ps(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> zeroinitializer, i8 %__U) #8
  ret <4 x float> %0
}

define <4 x float> @test_mm256_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
  ret <4 x float> %2
}

define <4 x float> @test_mm256_maskz_cvtpd_ps(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <2 x i64> @test_mm_cvtpd_epu32(<2 x double> %__A) {
; CHECK-LABEL: test_mm_cvtpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtpd2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_cvtpd_epu32(<4 x double> %__A) {
; CHECK-LABEL: test_mm256_cvtpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtpd2udq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm256_maskz_cvtpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvtps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_cvtps_epi32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <4 x i64> @test_mm256_mask_cvtps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_cvtps_epi32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <2 x double> @test_mm_mask_cvtps_pd(<2 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
  ret <2 x double> %1
}
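
; The 128-bit cvtps_pd tests widen only the low two floats: a shufflevector
; narrows <4 x float> to <2 x float>, and fpext then produces the two
; doubles, matching what VCVTPS2PD with an xmm source does. Illustrative C
; (not part of the test):
;
;   __m128d r = _mm_mask_cvtps_pd(W, U, A);  // converts A[0] and A[1] only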

define <2 x double> @test_mm_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
  ret <2 x double> %1
}

define <4 x double> @test_mm256_mask_cvtps_pd(<4 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_mask_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <4 x float> %__A to <4 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_maskz_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <4 x float> %__A to <4 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
  ret <4 x double> %1
}

define <2 x i64> @test_mm_cvtps_epu32(<4 x float> %__A) {
; CHECK-LABEL: test_mm_cvtps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvtps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtps_epu32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <4 x i64> @test_mm256_cvtps_epu32(<8 x float> %__A) {
; CHECK-LABEL: test_mm256_cvtps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2udq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_mask_cvtps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__W to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_cvtps_epu32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm256_maskz_cvttpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm_cvttpd_epu32(<2 x double> %__A) {
; CHECK-LABEL: test_mm_cvttpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttpd2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_cvttpd_epu32(<4 x double> %__A) {
; CHECK-LABEL: test_mm256_cvttpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttpd2udq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm256_maskz_cvttpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_cvttps_epi32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <4 x i64> @test_mm256_mask_cvttps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_cvttps_epi32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <2 x i64> @test_mm_cvttps_epu32(<4 x float> %__A) {
; CHECK-LABEL: test_mm_cvttps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttps2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttps_epu32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <4 x i64> @test_mm256_cvttps_epu32(<8 x float> %__A) {
; CHECK-LABEL: test_mm256_cvttps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttps2udq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_mask_cvttps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__W to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_cvttps_epu32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

define <2 x double> @test_mm_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm_cvtepu32_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2pd %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i = uitofp <2 x i32> %shuffle.i to <2 x double>
  ret <2 x double> %conv.i
}
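
; Unsigned 32-bit sources use plain uitofp IR rather than a target intrinsic;
; the backend matches the pattern to VCVTUDQ2PD/VCVTUDQ2PS. Illustrative C
; for the unmasked 128-bit case above (not part of the test):
;
;   __m128d r = _mm_cvtepu32_pd(A);  // converts the low two u32 lanes of A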

define <2 x double> @test_mm_mask_cvtepu32_pd(<2 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <4 x double> @test_mm256_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepu32_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2pd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = uitofp <4 x i32> %0 to <4 x double>
  ret <4 x double> %conv.i
}

define <4 x double> @test_mm256_mask_cvtepu32_pd(<4 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_mask_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_maskz_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x float> @test_mm_cvtepu32_ps(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepu32_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2ps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = uitofp <4 x i32> %0 to <4 x float>
  ret <4 x float> %conv.i
}

define <4 x float> @test_mm_mask_cvtepu32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_cvtepu32_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <8 x float> @test_mm256_cvtepu32_ps(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepu32_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2ps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i = uitofp <8 x i32> %0 to <8 x float>
  ret <8 x float> %conv.i
}

define <8 x float> @test_mm256_mask_cvtepu32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_cvtepu32_ps(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_shuffle_f32x4(<8 x float> %__A, <8 x float> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}
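
; The shuffle_f32x4/f64x2/i32x4 tests below all pick the high 128-bit half of
; each source, which is why the unmasked forms fold to a single
; vperm2f128/vperm2i128. In C terms this corresponds to an immediate of 3
; (bit 0 selects the half of __A, bit 1 the half of __B); illustrative only:
;
;   __m256 r = _mm256_shuffle_f32x4(A, B, 3);  // r = { A.hi128, B.hi128 }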

define <8 x float> @test_mm256_mask_shuffle_f32x4(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> %__W
  ret <8 x float> %1
}

define <8 x float> @test_mm256_maskz_shuffle_f32x4(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> zeroinitializer
  ret <8 x float> %1
}

define <4 x double> @test_mm256_shuffle_f64x2(<4 x double> %__A, <4 x double> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f64x2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %shuffle
}

define <4 x double> @test_mm256_mask_shuffle_f64x2(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_shuffle_f64x2(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> zeroinitializer
  ret <4 x double> %1
}

define <4 x i64> @test_mm256_shuffle_i32x4(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shuffle_i32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %shuffle
}

define <4 x i64> @test_mm256_mask_shuffle_i32x4(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_i32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
   1362 ; X86-NEXT:    retl
   1363 ;
   1364 ; X64-LABEL: test_mm256_mask_shuffle_i32x4:
   1365 ; X64:       # %bb.0: # %entry
   1366 ; X64-NEXT:    kmovw %edi, %k1
   1367 ; X64-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
   1368 ; X64-NEXT:    retq
   1369 entry:
   1370   %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1371   %0 = bitcast <4 x i64> %shuffle to <8 x i32>
   1372   %1 = bitcast <4 x i64> %__W to <8 x i32>
   1373   %2 = bitcast i8 %__U to <8 x i1>
   1374   %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
   1375   %4 = bitcast <8 x i32> %3 to <4 x i64>
   1376   ret <4 x i64> %4
   1377 }
   1378 
   1379 define <4 x i64> @test_mm256_maskz_shuffle_i32x4(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
   1380 ; X86-LABEL: test_mm256_maskz_shuffle_i32x4:
   1381 ; X86:       # %bb.0: # %entry
   1382 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1383 ; X86-NEXT:    kmovw %eax, %k1
   1384 ; X86-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
   1385 ; X86-NEXT:    retl
   1386 ;
   1387 ; X64-LABEL: test_mm256_maskz_shuffle_i32x4:
   1388 ; X64:       # %bb.0: # %entry
   1389 ; X64-NEXT:    kmovw %edi, %k1
   1390 ; X64-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
   1391 ; X64-NEXT:    retq
   1392 entry:
   1393   %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1394   %0 = bitcast <4 x i64> %shuffle to <8 x i32>
   1395   %1 = bitcast i8 %__U to <8 x i1>
   1396   %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
   1397   %3 = bitcast <8 x i32> %2 to <4 x i64>
   1398   ret <4 x i64> %3
   1399 }
   1400 
   1401 define <4 x i64> @test_mm256_shuffle_i64x2(<4 x i64> %__A, <4 x i64> %__B) {
   1402 ; CHECK-LABEL: test_mm256_shuffle_i64x2:
   1403 ; CHECK:       # %bb.0: # %entry
   1404 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
   1405 ; CHECK-NEXT:    ret{{[l|q]}}
   1406 entry:
   1407   %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1408   ret <4 x i64> %shuffle
   1409 }
   1410 
   1411 define <4 x i64> @test_mm256_mask_shuffle_i64x2(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
   1412 ; X86-LABEL: test_mm256_mask_shuffle_i64x2:
   1413 ; X86:       # %bb.0: # %entry
   1414 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1415 ; X86-NEXT:    kmovw %eax, %k1
   1416 ; X86-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
   1417 ; X86-NEXT:    retl
   1418 ;
   1419 ; X64-LABEL: test_mm256_mask_shuffle_i64x2:
   1420 ; X64:       # %bb.0: # %entry
   1421 ; X64-NEXT:    kmovw %edi, %k1
   1422 ; X64-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
   1423 ; X64-NEXT:    retq
   1424 entry:
   1425   %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1426   %0 = bitcast i8 %__U to <8 x i1>
   1427   %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1428   %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> %__W
   1429   ret <4 x i64> %1
   1430 }
   1431 
   1432 define <4 x i64> @test_mm256_maskz_shuffle_i64x2(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
   1433 ; X86-LABEL: test_mm256_maskz_shuffle_i64x2:
   1434 ; X86:       # %bb.0: # %entry
   1435 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1436 ; X86-NEXT:    kmovw %eax, %k1
   1437 ; X86-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
   1438 ; X86-NEXT:    retl
   1439 ;
   1440 ; X64-LABEL: test_mm256_maskz_shuffle_i64x2:
   1441 ; X64:       # %bb.0: # %entry
   1442 ; X64-NEXT:    kmovw %edi, %k1
   1443 ; X64-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
   1444 ; X64-NEXT:    retq
   1445 entry:
   1446   %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   1447   %0 = bitcast i8 %__U to <8 x i1>
   1448   %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1449   %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer
   1450   ret <4 x i64> %1
   1451 }
   1452 
   1453 define zeroext i8 @test_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
   1454 ; CHECK-LABEL: test_mm_test_epi32_mask:
   1455 ; CHECK:       # %bb.0: # %entry
   1456 ; CHECK-NEXT:    vptestmd %xmm0, %xmm1, %k0
   1457 ; CHECK-NEXT:    kmovw %k0, %eax
   1458 ; CHECK-NEXT:    movzbl %al, %eax
   1459 ; CHECK-NEXT:    ret{{[l|q]}}
   1460 entry:
   1461   %and.i.i = and <2 x i64> %__B, %__A
   1462   %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
   1463   %1 = icmp ne <4 x i32> %0, zeroinitializer
   1464   %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1465   %3 = bitcast <8 x i1> %2 to i8
   1466   ret i8 %3
   1467 }
   1468 
   1469 define zeroext i8 @test_mm_mask_test_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
   1470 ; X86-LABEL: test_mm_mask_test_epi32_mask:
   1471 ; X86:       # %bb.0: # %entry
   1472 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1473 ; X86-NEXT:    kmovw %eax, %k1
   1474 ; X86-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
   1475 ; X86-NEXT:    kmovw %k0, %eax
   1476 ; X86-NEXT:    movzbl %al, %eax
   1477 ; X86-NEXT:    retl
   1478 ;
   1479 ; X64-LABEL: test_mm_mask_test_epi32_mask:
   1480 ; X64:       # %bb.0: # %entry
   1481 ; X64-NEXT:    kmovw %edi, %k1
   1482 ; X64-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
   1483 ; X64-NEXT:    kmovw %k0, %eax
   1484 ; X64-NEXT:    movzbl %al, %eax
   1485 ; X64-NEXT:    retq
   1486 entry:
   1487   %and.i.i = and <2 x i64> %__B, %__A
   1488   %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
   1489   %1 = icmp ne <4 x i32> %0, zeroinitializer
   1490   %2 = bitcast i8 %__U to <8 x i1>
   1491   %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1492   %3 = and <4 x i1> %1, %extract.i
   1493   %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1494   %5 = bitcast <8 x i1> %4 to i8
   1495   ret i8 %5
   1496 }
   1497 
   1498 define zeroext i8 @test_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
   1499 ; CHECK-LABEL: test_mm256_test_epi32_mask:
   1500 ; CHECK:       # %bb.0: # %entry
   1501 ; CHECK-NEXT:    vptestmd %ymm0, %ymm1, %k0
   1502 ; CHECK-NEXT:    kmovw %k0, %eax
   1503 ; CHECK-NEXT:    movzbl %al, %eax
   1504 ; CHECK-NEXT:    vzeroupper
   1505 ; CHECK-NEXT:    ret{{[l|q]}}
   1506 entry:
   1507   %and.i.i = and <4 x i64> %__B, %__A
   1508   %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
   1509   %1 = icmp ne <8 x i32> %0, zeroinitializer
   1510   %2 = bitcast <8 x i1> %1 to i8
   1511   ret i8 %2
   1512 }
   1513 
   1514 define zeroext i8 @test_mm256_mask_test_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
   1515 ; X86-LABEL: test_mm256_mask_test_epi32_mask:
   1516 ; X86:       # %bb.0: # %entry
   1517 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1518 ; X86-NEXT:    kmovw %eax, %k1
   1519 ; X86-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
   1520 ; X86-NEXT:    kmovw %k0, %eax
   1521 ; X86-NEXT:    movzbl %al, %eax
   1522 ; X86-NEXT:    vzeroupper
   1523 ; X86-NEXT:    retl
   1524 ;
   1525 ; X64-LABEL: test_mm256_mask_test_epi32_mask:
   1526 ; X64:       # %bb.0: # %entry
   1527 ; X64-NEXT:    kmovw %edi, %k1
   1528 ; X64-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
   1529 ; X64-NEXT:    kmovw %k0, %eax
   1530 ; X64-NEXT:    movzbl %al, %eax
   1531 ; X64-NEXT:    vzeroupper
   1532 ; X64-NEXT:    retq
   1533 entry:
   1534   %and.i.i = and <4 x i64> %__B, %__A
   1535   %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
   1536   %1 = icmp ne <8 x i32> %0, zeroinitializer
   1537   %2 = bitcast i8 %__U to <8 x i1>
   1538   %3 = and <8 x i1> %1, %2
   1539   %4 = bitcast <8 x i1> %3 to i8
   1540   ret i8 %4
   1541 }
   1542 
   1543 define zeroext i8 @test_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
   1544 ; CHECK-LABEL: test_mm_test_epi64_mask:
   1545 ; CHECK:       # %bb.0: # %entry
   1546 ; CHECK-NEXT:    vptestmq %xmm0, %xmm1, %k0
   1547 ; CHECK-NEXT:    kmovw %k0, %eax
   1548 ; CHECK-NEXT:    movzbl %al, %eax
   1549 ; CHECK-NEXT:    ret{{[l|q]}}
   1550 entry:
   1551   %and.i.i = and <2 x i64> %__B, %__A
   1552   %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
   1553   %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
   1554   %2 = bitcast <8 x i1> %1 to i8
   1555   ret i8 %2
   1556 }
   1557 
   1558 define zeroext i8 @test_mm_mask_test_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
   1559 ; X86-LABEL: test_mm_mask_test_epi64_mask:
   1560 ; X86:       # %bb.0: # %entry
   1561 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1562 ; X86-NEXT:    kmovw %eax, %k1
   1563 ; X86-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
   1564 ; X86-NEXT:    kmovw %k0, %eax
   1565 ; X86-NEXT:    movzbl %al, %eax
   1566 ; X86-NEXT:    retl
   1567 ;
   1568 ; X64-LABEL: test_mm_mask_test_epi64_mask:
   1569 ; X64:       # %bb.0: # %entry
   1570 ; X64-NEXT:    kmovw %edi, %k1
   1571 ; X64-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
   1572 ; X64-NEXT:    kmovw %k0, %eax
   1573 ; X64-NEXT:    movzbl %al, %eax
   1574 ; X64-NEXT:    retq
   1575 entry:
   1576   %and.i.i = and <2 x i64> %__B, %__A
   1577   %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
   1578   %1 = bitcast i8 %__U to <8 x i1>
   1579   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   1580   %2 = and <2 x i1> %0, %extract.i
   1581   %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
   1582   %4 = bitcast <8 x i1> %3 to i8
   1583   ret i8 %4
   1584 }
   1585 
   1586 define zeroext i8 @test_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
   1587 ; CHECK-LABEL: test_mm256_test_epi64_mask:
   1588 ; CHECK:       # %bb.0: # %entry
   1589 ; CHECK-NEXT:    vptestmq %ymm0, %ymm1, %k0
   1590 ; CHECK-NEXT:    kmovw %k0, %eax
   1591 ; CHECK-NEXT:    movzbl %al, %eax
   1592 ; CHECK-NEXT:    vzeroupper
   1593 ; CHECK-NEXT:    ret{{[l|q]}}
   1594 entry:
   1595   %and.i.i = and <4 x i64> %__B, %__A
   1596   %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
   1597   %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1598   %2 = bitcast <8 x i1> %1 to i8
   1599   ret i8 %2
   1600 }
   1601 
   1602 define zeroext i8 @test_mm256_mask_test_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
   1603 ; X86-LABEL: test_mm256_mask_test_epi64_mask:
   1604 ; X86:       # %bb.0: # %entry
   1605 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1606 ; X86-NEXT:    kmovw %eax, %k1
   1607 ; X86-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
   1608 ; X86-NEXT:    kmovw %k0, %eax
   1609 ; X86-NEXT:    movzbl %al, %eax
   1610 ; X86-NEXT:    vzeroupper
   1611 ; X86-NEXT:    retl
   1612 ;
   1613 ; X64-LABEL: test_mm256_mask_test_epi64_mask:
   1614 ; X64:       # %bb.0: # %entry
   1615 ; X64-NEXT:    kmovw %edi, %k1
   1616 ; X64-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
   1617 ; X64-NEXT:    kmovw %k0, %eax
   1618 ; X64-NEXT:    movzbl %al, %eax
   1619 ; X64-NEXT:    vzeroupper
   1620 ; X64-NEXT:    retq
   1621 entry:
   1622   %and.i.i = and <4 x i64> %__B, %__A
   1623   %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
   1624   %1 = bitcast i8 %__U to <8 x i1>
   1625   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1626   %2 = and <4 x i1> %0, %extract.i
   1627   %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1628   %4 = bitcast <8 x i1> %3 to i8
   1629   ret i8 %4
   1630 }
   1631 
   1632 define zeroext i8 @test_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
   1633 ; CHECK-LABEL: test_mm_testn_epi32_mask:
   1634 ; CHECK:       # %bb.0: # %entry
   1635 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm1, %k0
   1636 ; CHECK-NEXT:    kmovw %k0, %eax
   1637 ; CHECK-NEXT:    movzbl %al, %eax
   1638 ; CHECK-NEXT:    ret{{[l|q]}}
   1639 entry:
   1640   %and.i.i = and <2 x i64> %__B, %__A
   1641   %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
   1642   %1 = icmp eq <4 x i32> %0, zeroinitializer
   1643   %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1644   %3 = bitcast <8 x i1> %2 to i8
   1645   ret i8 %3
   1646 }
   1647 
   1648 define zeroext i8 @test_mm_mask_testn_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
   1649 ; X86-LABEL: test_mm_mask_testn_epi32_mask:
   1650 ; X86:       # %bb.0: # %entry
   1651 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1652 ; X86-NEXT:    kmovw %eax, %k1
   1653 ; X86-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
   1654 ; X86-NEXT:    kmovw %k0, %eax
   1655 ; X86-NEXT:    movzbl %al, %eax
   1656 ; X86-NEXT:    retl
   1657 ;
   1658 ; X64-LABEL: test_mm_mask_testn_epi32_mask:
   1659 ; X64:       # %bb.0: # %entry
   1660 ; X64-NEXT:    kmovw %edi, %k1
   1661 ; X64-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
   1662 ; X64-NEXT:    kmovw %k0, %eax
   1663 ; X64-NEXT:    movzbl %al, %eax
   1664 ; X64-NEXT:    retq
   1665 entry:
   1666   %and.i.i = and <2 x i64> %__B, %__A
   1667   %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
   1668   %1 = icmp eq <4 x i32> %0, zeroinitializer
   1669   %2 = bitcast i8 %__U to <8 x i1>
   1670   %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1671   %3 = and <4 x i1> %1, %extract.i
   1672   %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1673   %5 = bitcast <8 x i1> %4 to i8
   1674   ret i8 %5
   1675 }
   1676 
   1677 define zeroext i8 @test_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
   1678 ; CHECK-LABEL: test_mm256_testn_epi32_mask:
   1679 ; CHECK:       # %bb.0: # %entry
   1680 ; CHECK-NEXT:    vptestnmd %ymm0, %ymm1, %k0
   1681 ; CHECK-NEXT:    kmovw %k0, %eax
   1682 ; CHECK-NEXT:    movzbl %al, %eax
   1683 ; CHECK-NEXT:    vzeroupper
   1684 ; CHECK-NEXT:    ret{{[l|q]}}
   1685 entry:
   1686   %and.i.i = and <4 x i64> %__B, %__A
   1687   %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
   1688   %1 = icmp eq <8 x i32> %0, zeroinitializer
   1689   %2 = bitcast <8 x i1> %1 to i8
   1690   ret i8 %2
   1691 }
   1692 
   1693 define zeroext i8 @test_mm256_mask_testn_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
   1694 ; X86-LABEL: test_mm256_mask_testn_epi32_mask:
   1695 ; X86:       # %bb.0: # %entry
   1696 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1697 ; X86-NEXT:    kmovw %eax, %k1
   1698 ; X86-NEXT:    vptestnmd %ymm0, %ymm1, %k0 {%k1}
   1699 ; X86-NEXT:    kmovw %k0, %eax
   1700 ; X86-NEXT:    movzbl %al, %eax
   1701 ; X86-NEXT:    vzeroupper
   1702 ; X86-NEXT:    retl
   1703 ;
   1704 ; X64-LABEL: test_mm256_mask_testn_epi32_mask:
   1705 ; X64:       # %bb.0: # %entry
   1706 ; X64-NEXT:    kmovw %edi, %k1
   1707 ; X64-NEXT:    vptestnmd %ymm0, %ymm1, %k0 {%k1}
   1708 ; X64-NEXT:    kmovw %k0, %eax
   1709 ; X64-NEXT:    movzbl %al, %eax
   1710 ; X64-NEXT:    vzeroupper
   1711 ; X64-NEXT:    retq
   1712 entry:
   1713   %and.i.i = and <4 x i64> %__B, %__A
   1714   %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
   1715   %1 = icmp eq <8 x i32> %0, zeroinitializer
   1716   %2 = bitcast i8 %__U to <8 x i1>
   1717   %3 = and <8 x i1> %1, %2
   1718   %4 = bitcast <8 x i1> %3 to i8
   1719   ret i8 %4
   1720 }
   1721 
   1722 define zeroext i8 @test_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
   1723 ; CHECK-LABEL: test_mm_testn_epi64_mask:
   1724 ; CHECK:       # %bb.0: # %entry
   1725 ; CHECK-NEXT:    vptestnmq %xmm0, %xmm1, %k0
   1726 ; CHECK-NEXT:    kmovw %k0, %eax
   1727 ; CHECK-NEXT:    movzbl %al, %eax
   1728 ; CHECK-NEXT:    ret{{[l|q]}}
   1729 entry:
   1730   %and.i.i = and <2 x i64> %__B, %__A
   1731   %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
   1732   %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
   1733   %2 = bitcast <8 x i1> %1 to i8
   1734   ret i8 %2
   1735 }
   1736 
   1737 define zeroext i8 @test_mm_mask_testn_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
   1738 ; X86-LABEL: test_mm_mask_testn_epi64_mask:
   1739 ; X86:       # %bb.0: # %entry
   1740 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1741 ; X86-NEXT:    kmovw %eax, %k1
   1742 ; X86-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
   1743 ; X86-NEXT:    kmovw %k0, %eax
   1744 ; X86-NEXT:    movzbl %al, %eax
   1745 ; X86-NEXT:    retl
   1746 ;
   1747 ; X64-LABEL: test_mm_mask_testn_epi64_mask:
   1748 ; X64:       # %bb.0: # %entry
   1749 ; X64-NEXT:    kmovw %edi, %k1
   1750 ; X64-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
   1751 ; X64-NEXT:    kmovw %k0, %eax
   1752 ; X64-NEXT:    movzbl %al, %eax
   1753 ; X64-NEXT:    retq
   1754 entry:
   1755   %and.i.i = and <2 x i64> %__B, %__A
   1756   %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
   1757   %1 = bitcast i8 %__U to <8 x i1>
   1758   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   1759   %2 = and <2 x i1> %0, %extract.i
   1760   %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
   1761   %4 = bitcast <8 x i1> %3 to i8
   1762   ret i8 %4
   1763 }
   1764 
   1765 define zeroext i8 @test_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
   1766 ; CHECK-LABEL: test_mm256_testn_epi64_mask:
   1767 ; CHECK:       # %bb.0: # %entry
   1768 ; CHECK-NEXT:    vptestnmq %ymm0, %ymm1, %k0
   1769 ; CHECK-NEXT:    kmovw %k0, %eax
   1770 ; CHECK-NEXT:    movzbl %al, %eax
   1771 ; CHECK-NEXT:    vzeroupper
   1772 ; CHECK-NEXT:    ret{{[l|q]}}
   1773 entry:
   1774   %and.i.i = and <4 x i64> %__B, %__A
   1775   %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
   1776   %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1777   %2 = bitcast <8 x i1> %1 to i8
   1778   ret i8 %2
   1779 }
   1780 
   1781 define zeroext i8 @test_mm256_mask_testn_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
   1782 ; X86-LABEL: test_mm256_mask_testn_epi64_mask:
   1783 ; X86:       # %bb.0: # %entry
   1784 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1785 ; X86-NEXT:    kmovw %eax, %k1
   1786 ; X86-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
   1787 ; X86-NEXT:    kmovw %k0, %eax
   1788 ; X86-NEXT:    movzbl %al, %eax
   1789 ; X86-NEXT:    vzeroupper
   1790 ; X86-NEXT:    retl
   1791 ;
   1792 ; X64-LABEL: test_mm256_mask_testn_epi64_mask:
   1793 ; X64:       # %bb.0: # %entry
   1794 ; X64-NEXT:    kmovw %edi, %k1
   1795 ; X64-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
   1796 ; X64-NEXT:    kmovw %k0, %eax
   1797 ; X64-NEXT:    movzbl %al, %eax
   1798 ; X64-NEXT:    vzeroupper
   1799 ; X64-NEXT:    retq
   1800 entry:
   1801   %and.i.i = and <4 x i64> %__B, %__A
   1802   %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
   1803   %1 = bitcast i8 %__U to <8 x i1>
   1804   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1805   %2 = and <4 x i1> %0, %extract.i
   1806   %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1807   %4 = bitcast <8 x i1> %3 to i8
   1808   ret i8 %4
   1809 }
   1810 
   1811 define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M)  {
   1812 ; X86-LABEL: test_mm_mask_set1_epi32:
   1813 ; X86:       # %bb.0: # %entry
   1814 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1815 ; X86-NEXT:    kmovw %eax, %k1
   1816 ; X86-NEXT:    vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1}
   1817 ; X86-NEXT:    retl
   1818 ;
   1819 ; X64-LABEL: test_mm_mask_set1_epi32:
   1820 ; X64:       # %bb.0: # %entry
   1821 ; X64-NEXT:    kmovw %edi, %k1
   1822 ; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1}
   1823 ; X64-NEXT:    retq
   1824 entry:
   1825   %0 = bitcast <2 x i64> %__O to <4 x i32>
   1826   %1 = bitcast i8 %__M to <8 x i1>
   1827   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1828   %2 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> %0
   1829   %3 = bitcast <4 x i32> %2 to <2 x i64>
   1830   ret <2 x i64> %3
   1831 }
   1832 
   1833 define <2 x i64> @test_mm_maskz_set1_epi32(i8 zeroext %__M) {
   1834 ; X86-LABEL: test_mm_maskz_set1_epi32:
   1835 ; X86:       # %bb.0: # %entry
   1836 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1837 ; X86-NEXT:    kmovw %eax, %k1
   1838 ; X86-NEXT:    vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1} {z}
   1839 ; X86-NEXT:    retl
   1840 ;
   1841 ; X64-LABEL: test_mm_maskz_set1_epi32:
   1842 ; X64:       # %bb.0: # %entry
   1843 ; X64-NEXT:    kmovw %edi, %k1
   1844 ; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
   1845 ; X64-NEXT:    retq
   1846 entry:
   1847   %0 = bitcast i8 %__M to <8 x i1>
   1848   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1849   %1 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> zeroinitializer
   1850   %2 = bitcast <4 x i32> %1 to <2 x i64>
   1851   ret <2 x i64> %2
   1852 }
   1853 
   1854 define <4 x i64> @test_mm256_mask_set1_epi32(<4 x i64> %__O, i8 zeroext %__M)  {
   1855 ; X86-LABEL: test_mm256_mask_set1_epi32:
   1856 ; X86:       # %bb.0: # %entry
   1857 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1858 ; X86-NEXT:    kmovw %eax, %k1
   1859 ; X86-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1}
   1860 ; X86-NEXT:    retl
   1861 ;
   1862 ; X64-LABEL: test_mm256_mask_set1_epi32:
   1863 ; X64:       # %bb.0: # %entry
   1864 ; X64-NEXT:    kmovw %edi, %k1
   1865 ; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1}
   1866 ; X64-NEXT:    retq
   1867 entry:
   1868   %0 = bitcast <4 x i64> %__O to <8 x i32>
   1869   %1 = bitcast i8 %__M to <8 x i1>
   1870   %2 = select <8 x i1> %1, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> %0
   1871   %3 = bitcast <8 x i32> %2 to <4 x i64>
   1872   ret <4 x i64> %3
   1873 }
   1874 
   1875 define <4 x i64> @test_mm256_maskz_set1_epi32(i8 zeroext %__M)  {
   1876 ; X86-LABEL: test_mm256_maskz_set1_epi32:
   1877 ; X86:       # %bb.0: # %entry
   1878 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1879 ; X86-NEXT:    kmovw %eax, %k1
   1880 ; X86-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1} {z}
   1881 ; X86-NEXT:    retl
   1882 ;
   1883 ; X64-LABEL: test_mm256_maskz_set1_epi32:
   1884 ; X64:       # %bb.0: # %entry
   1885 ; X64-NEXT:    kmovw %edi, %k1
   1886 ; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
   1887 ; X64-NEXT:    retq
   1888 entry:
   1889   %0 = bitcast i8 %__M to <8 x i1>
   1890   %1 = select <8 x i1> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> zeroinitializer
   1891   %2 = bitcast <8 x i32> %1 to <4 x i64>
   1892   ret <4 x i64> %2
   1893 }
   1894 
   1895 define <2 x i64> @test_mm_mask_set1_epi64(<2 x i64> %__O, i8 zeroext %__M, i64 %__A)  {
   1896 ; X86-LABEL: test_mm_mask_set1_epi64:
   1897 ; X86:       # %bb.0: # %entry
   1898 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1899 ; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1900 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
   1901 ; X86-NEXT:    kmovw %eax, %k1
   1902 ; X86-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
   1903 ; X86-NEXT:    retl
   1904 ;
   1905 ; X64-LABEL: test_mm_mask_set1_epi64:
   1906 ; X64:       # %bb.0: # %entry
   1907 ; X64-NEXT:    kmovw %edi, %k1
   1908 ; X64-NEXT:    vpbroadcastq %rsi, %xmm0 {%k1}
   1909 ; X64-NEXT:    retq
   1910 entry:
   1911   %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
   1912   %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
   1913   %0 = bitcast i8 %__M to <8 x i1>
   1914   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   1915   %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> %__O
   1916   ret <2 x i64> %1
   1917 }
   1918 
   1919 define <2 x i64> @test_mm_maskz_set1_epi64(i8 zeroext %__M, i64 %__A)  {
   1920 ; X86-LABEL: test_mm_maskz_set1_epi64:
   1921 ; X86:       # %bb.0: # %entry
   1922 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1923 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1924 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
   1925 ; X86-NEXT:    kmovw %eax, %k1
   1926 ; X86-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
   1927 ; X86-NEXT:    retl
   1928 ;
   1929 ; X64-LABEL: test_mm_maskz_set1_epi64:
   1930 ; X64:       # %bb.0: # %entry
   1931 ; X64-NEXT:    kmovw %edi, %k1
   1932 ; X64-NEXT:    vpbroadcastq %rsi, %xmm0 {%k1} {z}
   1933 ; X64-NEXT:    retq
   1934 entry:
   1935   %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
   1936   %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
   1937   %0 = bitcast i8 %__M to <8 x i1>
   1938   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   1939   %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> zeroinitializer
   1940   ret <2 x i64> %1
   1941 }
   1942 
   1943 
   1944 define <4 x i64> @test_mm256_mask_set1_epi64(<4 x i64> %__O, i8 zeroext %__M, i64 %__A) {
   1945 ; X86-LABEL: test_mm256_mask_set1_epi64:
   1946 ; X86:       # %bb.0: # %entry
   1947 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1948 ; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1949 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
   1950 ; X86-NEXT:    kmovw %eax, %k1
   1951 ; X86-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
   1952 ; X86-NEXT:    retl
   1953 ;
   1954 ; X64-LABEL: test_mm256_mask_set1_epi64:
   1955 ; X64:       # %bb.0: # %entry
   1956 ; X64-NEXT:    kmovw %edi, %k1
   1957 ; X64-NEXT:    vpbroadcastq %rsi, %ymm0 {%k1}
   1958 ; X64-NEXT:    retq
   1959 entry:
   1960   %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
   1961   %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
   1962   %0 = bitcast i8 %__M to <8 x i1>
   1963   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1964   %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> %__O
   1965   ret <4 x i64> %1
   1966 }
   1967 
   1968 define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A)  {
   1969 ; X86-LABEL: test_mm256_maskz_set1_epi64:
   1970 ; X86:       # %bb.0: # %entry
   1971 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1972 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1973 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
   1974 ; X86-NEXT:    kmovw %eax, %k1
   1975 ; X86-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
   1976 ; X86-NEXT:    retl
   1977 ;
   1978 ; X64-LABEL: test_mm256_maskz_set1_epi64:
   1979 ; X64:       # %bb.0: # %entry
   1980 ; X64-NEXT:    kmovw %edi, %k1
   1981 ; X64-NEXT:    vpbroadcastq %rsi, %ymm0 {%k1} {z}
   1982 ; X64-NEXT:    retq
   1983 entry:
   1984   %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
   1985   %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
   1986   %0 = bitcast i8 %__M to <8 x i1>
   1987   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1988   %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> zeroinitializer
   1989   ret <4 x i64> %1
   1990 }
   1991 
   1992 define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
   1993 ; CHECK-LABEL: test_mm_broadcastd_epi32:
   1994 ; CHECK:       # %bb.0:
   1995 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
   1996 ; CHECK-NEXT:    ret{{[l|q]}}
   1997   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   1998   %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
   1999   %res1 = bitcast <4 x i32> %res0 to <2 x i64>
   2000   ret <2 x i64> %res1
   2001 }
   2002 
   2003 define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
   2004 ; X86-LABEL: test_mm_mask_broadcastd_epi32:
   2005 ; X86:       # %bb.0: # %entry
   2006 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2007 ; X86-NEXT:    kmovw %eax, %k1
   2008 ; X86-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
   2009 ; X86-NEXT:    retl
   2010 ;
   2011 ; X64-LABEL: test_mm_mask_broadcastd_epi32:
   2012 ; X64:       # %bb.0: # %entry
   2013 ; X64-NEXT:    kmovw %edi, %k1
   2014 ; X64-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
   2015 ; X64-NEXT:    retq
   2016 entry:
   2017   %0 = bitcast <2 x i64> %__A to <4 x i32>
   2018   %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
   2019   %1 = bitcast <2 x i64> %__O to <4 x i32>
   2020   %2 = bitcast i8 %__M to <8 x i1>
   2021   %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   2022   %3 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> %1
   2023   %4 = bitcast <4 x i32> %3 to <2 x i64>
   2024   ret <2 x i64> %4
   2025 }
   2026 
   2027 define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 zeroext %__M, <2 x i64> %__A) {
   2028 ; X86-LABEL: test_mm_maskz_broadcastd_epi32:
   2029 ; X86:       # %bb.0: # %entry
   2030 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2031 ; X86-NEXT:    kmovw %eax, %k1
   2032 ; X86-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
   2033 ; X86-NEXT:    retl
   2034 ;
   2035 ; X64-LABEL: test_mm_maskz_broadcastd_epi32:
   2036 ; X64:       # %bb.0: # %entry
   2037 ; X64-NEXT:    kmovw %edi, %k1
   2038 ; X64-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
   2039 ; X64-NEXT:    retq
   2040 entry:
   2041   %0 = bitcast <2 x i64> %__A to <4 x i32>
   2042   %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
   2043   %1 = bitcast i8 %__M to <8 x i1>
   2044   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   2045   %2 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> zeroinitializer
   2046   %3 = bitcast <4 x i32> %2 to <2 x i64>
   2047   ret <2 x i64> %3
   2048 }
   2049 
   2050 define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
   2051 ; CHECK-LABEL: test_mm256_broadcastd_epi32:
   2052 ; CHECK:       # %bb.0:
   2053 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
   2054 ; CHECK-NEXT:    ret{{[l|q]}}
   2055   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   2056   %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
   2057   %res1 = bitcast <8 x i32> %res0 to <4 x i64>
   2058   ret <4 x i64> %res1
   2059 }
   2060 
   2061 define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
   2062 ; X86-LABEL: test_mm256_mask_broadcastd_epi32:
   2063 ; X86:       # %bb.0:
   2064 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2065 ; X86-NEXT:    kmovw %eax, %k1
   2066 ; X86-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
   2067 ; X86-NEXT:    retl
   2068 ;
   2069 ; X64-LABEL: test_mm256_mask_broadcastd_epi32:
   2070 ; X64:       # %bb.0:
   2071 ; X64-NEXT:    kmovw %edi, %k1
   2072 ; X64-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
   2073 ; X64-NEXT:    retq
   2074   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
   2075   %arg1 = bitcast i8 %a1 to <8 x i1>
   2076   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
   2077   %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
   2078   %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
   2079   %res2 = bitcast <8 x i32> %res1 to <4 x i64>
   2080   ret <4 x i64> %res2
   2081 }
   2082 
   2083 define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
   2084 ; X86-LABEL: test_mm256_maskz_broadcastd_epi32:
   2085 ; X86:       # %bb.0:
   2086 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2087 ; X86-NEXT:    kmovw %eax, %k1
   2088 ; X86-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
   2089 ; X86-NEXT:    retl
   2090 ;
   2091 ; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
   2092 ; X64:       # %bb.0:
   2093 ; X64-NEXT:    kmovw %edi, %k1
   2094 ; X64-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
   2095 ; X64-NEXT:    retq
   2096   %arg0 = bitcast i8 %a0 to <8 x i1>
   2097   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   2098   %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
   2099   %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
   2100   %res2 = bitcast <8 x i32> %res1 to <4 x i64>
   2101   ret <4 x i64> %res2
   2102 }
   2103 
   2104 define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
   2105 ; CHECK-LABEL: test_mm_broadcastq_epi64:
   2106 ; CHECK:       # %bb.0:
   2107 ; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm0
   2108 ; CHECK-NEXT:    ret{{[l|q]}}
   2109   %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
   2110   ret <2 x i64> %res
   2111 }
   2112 
   2113 define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
   2114 ; X86-LABEL: test_mm_mask_broadcastq_epi64:
   2115 ; X86:       # %bb.0: # %entry
   2116 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2117 ; X86-NEXT:    kmovw %eax, %k1
   2118 ; X86-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
   2119 ; X86-NEXT:    retl
   2120 ;
   2121 ; X64-LABEL: test_mm_mask_broadcastq_epi64:
   2122 ; X64:       # %bb.0: # %entry
   2123 ; X64-NEXT:    kmovw %edi, %k1
   2124 ; X64-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
   2125 ; X64-NEXT:    retq
   2126 entry:
   2127   %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
   2128   %0 = bitcast i8 %__M to <8 x i1>
   2129   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   2130   %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> %__O
   2131   ret <2 x i64> %1
   2132 }
   2133 
   2134 define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
   2135 ; X86-LABEL: test_mm_maskz_broadcastq_epi64:
   2136 ; X86:       # %bb.0: # %entry
   2137 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2138 ; X86-NEXT:    kmovw %eax, %k1
   2139 ; X86-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
   2140 ; X86-NEXT:    retl
   2141 ;
   2142 ; X64-LABEL: test_mm_maskz_broadcastq_epi64:
   2143 ; X64:       # %bb.0: # %entry
   2144 ; X64-NEXT:    kmovw %edi, %k1
   2145 ; X64-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
   2146 ; X64-NEXT:    retq
   2147 entry:
   2148   %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
   2149   %0 = bitcast i8 %__M to <8 x i1>
   2150   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   2151   %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> zeroinitializer
   2152   ret <2 x i64> %1
   2153 }
   2154 
   2155 define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
   2156 ; CHECK-LABEL: test_mm256_broadcastq_epi64:
   2157 ; CHECK:       # %bb.0:
   2158 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
   2159 ; CHECK-NEXT:    ret{{[l|q]}}
   2160   %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
   2161   ret <4 x i64> %res
   2162 }
   2163 
   2164 define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
   2165 ; X86-LABEL: test_mm256_mask_broadcastq_epi64:
   2166 ; X86:       # %bb.0: # %entry
   2167 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2168 ; X86-NEXT:    kmovw %eax, %k1
   2169 ; X86-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
   2170 ; X86-NEXT:    retl
   2171 ;
   2172 ; X64-LABEL: test_mm256_mask_broadcastq_epi64:
   2173 ; X64:       # %bb.0: # %entry
   2174 ; X64-NEXT:    kmovw %edi, %k1
   2175 ; X64-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
   2176 ; X64-NEXT:    retq
   2177 entry:
   2178   %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
   2179   %0 = bitcast i8 %__M to <8 x i1>
   2180   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   2181   %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> %__O
   2182   ret <4 x i64> %1
   2183 }
   2184 
   2185 define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
   2186 ; X86-LABEL: test_mm256_maskz_broadcastq_epi64:
   2187 ; X86:       # %bb.0: # %entry
   2188 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2189 ; X86-NEXT:    kmovw %eax, %k1
   2190 ; X86-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
   2191 ; X86-NEXT:    retl
   2192 ;
   2193 ; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
   2194 ; X64:       # %bb.0: # %entry
   2195 ; X64-NEXT:    kmovw %edi, %k1
   2196 ; X64-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
   2197 ; X64-NEXT:    retq
   2198 entry:
   2199   %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
   2200   %0 = bitcast i8 %__M to <8 x i1>
   2201   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   2202   %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> zeroinitializer
   2203   ret <4 x i64> %1
   2204 }
   2205 
   2206 define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
   2207 ; CHECK-LABEL: test_mm256_broadcastsd_pd:
   2208 ; CHECK:       # %bb.0:
   2209 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
   2210 ; CHECK-NEXT:    ret{{[l|q]}}
   2211   %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
   2212   ret <4 x double> %res
   2213 }
   2214 
   2215 define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %__O, i8 zeroext %__M, <2 x double> %__A) {
   2216 ; X86-LABEL: test_mm256_mask_broadcastsd_pd:
   2217 ; X86:       # %bb.0: # %entry
   2218 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2219 ; X86-NEXT:    kmovw %eax, %k1
   2220 ; X86-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
   2221 ; X86-NEXT:    retl
   2222 ;
   2223 ; X64-LABEL: test_mm256_mask_broadcastsd_pd:
   2224 ; X64:       # %bb.0: # %entry
   2225 ; X64-NEXT:    kmovw %edi, %k1
   2226 ; X64-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
   2227 ; X64-NEXT:    retq
   2228 entry:
   2229   %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
   2230   %0 = bitcast i8 %__M to <8 x i1>
   2231   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   2232   %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__O
   2233   ret <4 x double> %1
   2234 }
   2235 
   2236 define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 zeroext %__M, <2 x double> %__A) {
   2237 ; X86-LABEL: test_mm256_maskz_broadcastsd_pd:
   2238 ; X86:       # %bb.0: # %entry
   2239 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2240 ; X86-NEXT:    kmovw %eax, %k1
   2241 ; X86-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
   2242 ; X86-NEXT:    retl
   2243 ;
   2244 ; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
   2245 ; X64:       # %bb.0: # %entry
   2246 ; X64-NEXT:    kmovw %edi, %k1
   2247 ; X64-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
   2248 ; X64-NEXT:    retq
   2249 entry:
   2250   %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
   2251   %0 = bitcast i8 %__M to <8 x i1>
   2252   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   2253   %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
   2254   ret <4 x double> %1
   2255 }
   2256 
   2257 define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
   2258 ; CHECK-LABEL: test_mm_broadcastss_ps:
   2259 ; CHECK:       # %bb.0:
   2260 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
   2261 ; CHECK-NEXT:    ret{{[l|q]}}
   2262   %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
   2263   ret <4 x float> %res
   2264 }
   2265 
   2266 define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %__O, i8 zeroext %__M, <4 x float> %__A) {
   2267 ; X86-LABEL: test_mm_mask_broadcastss_ps:
   2268 ; X86:       # %bb.0: # %entry
   2269 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2270 ; X86-NEXT:    kmovw %eax, %k1
   2271 ; X86-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
   2272 ; X86-NEXT:    retl
   2273 ;
   2274 ; X64-LABEL: test_mm_mask_broadcastss_ps:
   2275 ; X64:       # %bb.0: # %entry
   2276 ; X64-NEXT:    kmovw %edi, %k1
   2277 ; X64-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
   2278 ; X64-NEXT:    retq
   2279 entry:
   2280   %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
   2281   %0 = bitcast i8 %__M to <8 x i1>
   2282   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   2283   %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__O
   2284   ret <4 x float> %1
   2285 }
   2286 
   2287 define <4 x float> @test_mm_maskz_broadcastss_ps(i8 zeroext %__M, <4 x float> %__A) {
   2288 ; X86-LABEL: test_mm_maskz_broadcastss_ps:
   2289 ; X86:       # %bb.0: # %entry
   2290 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2291 ; X86-NEXT:    kmovw %eax, %k1
   2292 ; X86-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
   2293 ; X86-NEXT:    retl
   2294 ;
   2295 ; X64-LABEL: test_mm_maskz_broadcastss_ps:
   2296 ; X64:       # %bb.0: # %entry
   2297 ; X64-NEXT:    kmovw %edi, %k1
   2298 ; X64-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
   2299 ; X64-NEXT:    retq
   2300 entry:
   2301   %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
   2302   %0 = bitcast i8 %__M to <8 x i1>
   2303   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   2304   %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
   2305   ret <4 x float> %1
   2306 }
   2307 
   2308 define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
   2309 ; CHECK-LABEL: test_mm256_broadcastss_ps:
   2310 ; CHECK:       # %bb.0:
   2311 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
   2312 ; CHECK-NEXT:    ret{{[l|q]}}
   2313   %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
   2314   ret <8 x float> %res
   2315 }
   2316 
   2317 define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
   2318 ; X86-LABEL: test_mm256_mask_broadcastss_ps:
   2319 ; X86:       # %bb.0:
   2320 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2321 ; X86-NEXT:    kmovw %eax, %k1
   2322 ; X86-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
   2323 ; X86-NEXT:    retl
   2324 ;
   2325 ; X64-LABEL: test_mm256_mask_broadcastss_ps:
   2326 ; X64:       # %bb.0:
   2327 ; X64-NEXT:    kmovw %edi, %k1
   2328 ; X64-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
   2329 ; X64-NEXT:    retq
   2330   %arg1 = bitcast i8 %a1 to <8 x i1>
   2331   %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
   2332   %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
   2333   ret <8 x float> %res1
   2334 }
   2335 
   2336 define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
   2337 ; X86-LABEL: test_mm256_maskz_broadcastss_ps:
   2338 ; X86:       # %bb.0:
   2339 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2340 ; X86-NEXT:    kmovw %eax, %k1
   2341 ; X86-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
   2342 ; X86-NEXT:    retl
   2343 ;
   2344 ; X64-LABEL: test_mm256_maskz_broadcastss_ps:
   2345 ; X64:       # %bb.0:
   2346 ; X64-NEXT:    kmovw %edi, %k1
   2347 ; X64-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
   2348 ; X64-NEXT:    retq
   2349   %arg0 = bitcast i8 %a0 to <8 x i1>
   2350   %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
   2351   %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
   2352   ret <8 x float> %res1
   2353 }
   2354 
   2355 define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
   2356 ; CHECK-LABEL: test_mm_movddup_pd:
   2357 ; CHECK:       # %bb.0:
   2358 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
   2359 ; CHECK-NEXT:    ret{{[l|q]}}
   2360   %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
   2361   ret <2 x double> %res
   2362 }
   2363 
   2364 define <2 x double> @test_mm_mask_movedup_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
   2365 ; X86-LABEL: test_mm_mask_movedup_pd:
   2366 ; X86:       # %bb.0: # %entry
   2367 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2368 ; X86-NEXT:    kmovw %eax, %k1
   2369 ; X86-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
   2370 ; X86-NEXT:    retl
   2371 ;
   2372 ; X64-LABEL: test_mm_mask_movedup_pd:
   2373 ; X64:       # %bb.0: # %entry
   2374 ; X64-NEXT:    kmovw %edi, %k1
   2375 ; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
   2376 ; X64-NEXT:    retq
   2377 entry:
   2378   %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
   2379   %0 = bitcast i8 %__U to <8 x i1>
   2380   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   2381   %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> %__W
   2382   ret <2 x double> %1
   2383 }
   2384 
   2385 define <2 x double> @test_mm_maskz_movedup_pd(i8 zeroext %__U, <2 x double> %__A) {
   2386 ; X86-LABEL: test_mm_maskz_movedup_pd:
   2387 ; X86:       # %bb.0: # %entry
   2388 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2389 ; X86-NEXT:    kmovw %eax, %k1
   2390 ; X86-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
   2391 ; X86-NEXT:    retl
   2392 ;
   2393 ; X64-LABEL: test_mm_maskz_movedup_pd:
   2394 ; X64:       # %bb.0: # %entry
   2395 ; X64-NEXT:    kmovw %edi, %k1
   2396 ; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
   2397 ; X64-NEXT:    retq
   2398 entry:
   2399   %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
   2400   %0 = bitcast i8 %__U to <8 x i1>
   2401   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   2402   %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> zeroinitializer
   2403   ret <2 x double> %1
   2404 }
   2405 
   2406 define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
   2407 ; CHECK-LABEL: test_mm256_movddup_pd:
   2408 ; CHECK:       # %bb.0:
   2409 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
   2410 ; CHECK-NEXT:    ret{{[l|q]}}
   2411   %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
   2412   ret <4 x double> %res
   2413 }
   2414 
   2415 define <4 x double> @test_mm256_mask_movedup_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
   2416 ; X86-LABEL: test_mm256_mask_movedup_pd:
   2417 ; X86:       # %bb.0: # %entry
   2418 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2419 ; X86-NEXT:    kmovw %eax, %k1
   2420 ; X86-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
   2421 ; X86-NEXT:    retl
   2422 ;
   2423 ; X64-LABEL: test_mm256_mask_movedup_pd:
   2424 ; X64:       # %bb.0: # %entry
   2425 ; X64-NEXT:    kmovw %edi, %k1
   2426 ; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
   2427 ; X64-NEXT:    retq
   2428 entry:
   2429   %shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
   2430   %0 = bitcast i8 %__U to <8 x i1>
   2431   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   2432   %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__W
   2433   ret <4 x double> %1
   2434 }
   2435 
   2436 define <4 x double> @test_mm256_maskz_movedup_pd(i8 zeroext %__U, <4 x double> %__A) {
   2437 ; X86-LABEL: test_mm256_maskz_movedup_pd:
   2438 ; X86:       # %bb.0: # %entry
   2439 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2440 ; X86-NEXT:    kmovw %eax, %k1
   2441 ; X86-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
   2442 ; X86-NEXT:    retl
   2443 ;
   2444 ; X64-LABEL: test_mm256_maskz_movedup_pd:
   2445 ; X64:       # %bb.0: # %entry
   2446 ; X64-NEXT:    kmovw %edi, %k1
   2447 ; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
   2448 ; X64-NEXT:    retq
   2449 entry:
   2450   %shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
   2451   %0 = bitcast i8 %__U to <8 x i1>
   2452   %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   2453   %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
   2454   ret <4 x double> %1
   2455 }
   2456 
define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_movehdup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_movehdup_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
  ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_movehdup_ps(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_movehdup_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
  ret <4 x float> %1
}

define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_movehdup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_mask_movehdup_ps:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_movehdup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
; X86-LABEL: test_mm256_maskz_movehdup_ps:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_movehdup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

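; The moveldup tests mirror the movehdup ones but duplicate the even lanes,
; so the expected instruction is vmovsldup with the same masking pattern.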
define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_moveldup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_moveldup_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
  ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_moveldup_ps(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_moveldup_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
  ret <4 x float> %1
}

define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_moveldup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_mask_moveldup_ps:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_moveldup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
; X86-LABEL: test_mm256_maskz_moveldup_ps:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_moveldup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

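; Immediate 64-bit-element permutes. Note the unmasked form is printed as
; vpermpd while the masked forms keep vpermq; presumably the FP-domain
; encoding is acceptable when no mask forces an integer-domain result.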
define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permutex_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X) {
; X86-LABEL: test_mm256_mask_permutex_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
; X64-NEXT:    retq
entry:
  %perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  %0 = bitcast i8 %__M to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> %__W
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 zeroext %__M, <4 x i64> %__X) {
; X86-LABEL: test_mm256_maskz_permutex_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
; X64-NEXT:    retq
entry:
  %perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  %0 = bitcast i8 %__M to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> zeroinitializer
  ret <4 x i64> %1
}

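; The same immediate permute on double elements, lowered to vpermpd. Note the
; masked tests use a different immediate (1,0,0,0) than the unmasked test
; (3,0,0,0).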
define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permutex_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__X) {
; X86-LABEL: test_mm256_mask_permutex_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT:    retq
entry:
  %perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_permutex_pd(i8 zeroext %__U, <4 x double> %__X) {
; X86-LABEL: test_mm256_maskz_permutex_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT:    retq
entry:
  %perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> zeroinitializer
  ret <4 x double> %1
}

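; _mm_shuffle_pd with selector <1,3> takes the high element of each source, so
; it can be matched to vunpckhpd; the 256-bit form operates per 128-bit lane
; and stays a vshufpd.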
define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_mm_shuffle_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_shuffle_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X64-NEXT:    retq
entry:
  %shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> %__W
  ret <2 x double> %1
}

define <2 x double> @test_mm_maskz_shuffle_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_shuffle_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X64-NEXT:    retq
entry:
  %shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> zeroinitializer
  ret <2 x double> %1
}

define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_mm256_shuffle_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X64-NEXT:    retq
entry:
  %shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_shuffle_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT:    retq
entry:
  %shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> zeroinitializer
  ret <4 x double> %1
}

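; _mm_shuffle_ps picks its first two elements from the first source and the
; other two from the second; the 256-bit form repeats the pattern in each
; 128-bit lane, so both should select vshufps.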
define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_mm_shuffle_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_shuffle_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X64-NEXT:    retq
entry:
  %shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> %__W
  ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_shuffle_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_shuffle_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X64-NEXT:    retq
entry:
  %shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> zeroinitializer
  ret <4 x float> %1
}

define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_mm256_shuffle_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
; X86-LABEL: test_mm256_mask_shuffle_ps:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_maskz_shuffle_ps:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

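; mul_epi32: clang expresses the sign-extension of the low 32 bits of each
; 64-bit element as shl 32 followed by ashr exact 32; the widening signed
; multiply of those values is what vpmuldq computes.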
define <4 x i64> @test_mm256_mask_mul_epi32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_mask_mul_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_mul_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
  %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
  %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
  %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
  %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
  %tmp5 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> %__W
  ret <4 x i64> %tmp6
}

define <4 x i64> @test_mm256_maskz_mul_epi32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_maskz_mul_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_mul_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
  %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
  %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
  %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
  %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
  %tmp5 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> zeroinitializer
  ret <4 x i64> %tmp6
}

define <2 x i64> @test_mm_mask_mul_epi32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_mask_mul_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_mul_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
; X64-NEXT:    retq
  %tmp = shl <2 x i64> %__X, <i64 32, i64 32>
  %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
  %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
  %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
  %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
  %tmp5 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> %__W
  ret <2 x i64> %tmp6
}

define <2 x i64> @test_mm_maskz_mul_epi32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_maskz_mul_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_mul_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
  %tmp = shl <2 x i64> %__X, <i64 32, i64 32>
  %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
  %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
  %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
  %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
  %tmp5 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> zeroinitializer
  ret <2 x i64> %tmp6
}

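; mul_epu32 is the unsigned counterpart: masking each 64-bit element with
; 0xffffffff zero-extends the low half, matching vpmuludq.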
define <4 x i64> @test_mm256_mask_mul_epu32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_mask_mul_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_mul_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %tmp2 = mul nuw <4 x i64> %tmp1, %tmp
  %tmp3 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> %__W
  ret <4 x i64> %tmp4
}

define <4 x i64> @test_mm256_maskz_mul_epu32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_maskz_mul_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_mul_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %tmp2 = mul nuw <4 x i64> %tmp1, %tmp
  %tmp3 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> zeroinitializer
  ret <4 x i64> %tmp4
}

define <2 x i64> @test_mm_mask_mul_epu32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_mask_mul_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_mul_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
  %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
  %tmp2 = mul nuw <2 x i64> %tmp1, %tmp
  %tmp3 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> %__W
  ret <2 x i64> %tmp4
}

define <2 x i64> @test_mm_maskz_mul_epu32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_maskz_mul_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_mul_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
  %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
  %tmp2 = mul nuw <2 x i64> %tmp1, %tmp
  %tmp3 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> zeroinitializer
  ret <2 x i64> %tmp4
}

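; Integer down-conversions, written as a trunc whose result is concatenated
; with zeros. For 128-bit sources these can be matched to a single
; vpshufb/vinsertps; 256-bit sources use the AVX-512 vpmov* truncating moves,
; with the masked dword-to-word forms going through the
; llvm.x86.avx512.mask.pmov.dw.256 intrinsic.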
define <2 x i64> @test_mm_cvtepi32_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = trunc <4 x i32> %0 to <4 x i8>
  %shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %1 = bitcast <16 x i8> %shuf.i to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_cvtepi32_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = trunc <4 x i32> %0 to <4 x i16>
  %shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %1 = bitcast <8 x i16> %shuf.i to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_cvtepi64_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <2 x i64> %__A to <2 x i8>
  %shuf.i = shufflevector <2 x i8> %conv.i, <2 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %0 = bitcast <16 x i8> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm_cvtepi64_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <2 x i64> %__A to <2 x i16>
  %shuf.i = shufflevector <2 x i16> %conv.i, <2 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %0 = bitcast <8 x i16> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm_cvtepi64_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <2 x i64> %__A to <2 x i32>
  %shuf.i = shufflevector <2 x i32> %conv.i, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %0 = bitcast <4 x i32> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm256_cvtepi32_epi16(<4 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepi32_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovdw %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i = trunc <8 x i32> %0 to <8 x i16>
  %1 = bitcast <8 x i16> %conv.i to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtepi32_epi16(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi32_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovdw %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepi32_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovdw %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <2 x i64> %__O to <8 x i16>
  %2 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> %1, i8 %__M)
  %3 = bitcast <8 x i16> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm256_maskz_cvtepi32_epi16(i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi32_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepi32_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> zeroinitializer, i8 %__M)
  %2 = bitcast <8 x i16> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm256_cvtepi64_epi32(<4 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepi64_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqd %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <4 x i64> %__A to <4 x i32>
  %0 = bitcast <4 x i32> %conv.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm256_mask_cvtepi64_epi32(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi64_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqd %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepi64_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqd %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %conv.i.i = trunc <4 x i64> %__A to <4 x i32>
  %0 = bitcast <2 x i64> %__O to <4 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> %0
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm256_maskz_cvtepi64_epi32(i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi64_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqd %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepi64_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqd %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %conv.i.i = trunc <4 x i64> %__A to <4 x i32>
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> zeroinitializer
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm256_cvtepi64_epi8(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi64_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqb %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <4 x i64> %__A to <4 x i8>
  %shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <16 x i8> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm256_cvtepi64_epi16(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi64_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqw %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <4 x i64> %__A to <4 x i16>
  %shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <8 x i16> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm256_cvtepi32_epi8(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi32_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovdb %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i = trunc <8 x i32> %0 to <8 x i8>
  %shuf.i = shufflevector <8 x i8> %conv.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %1 = bitcast <16 x i8> %shuf.i to <2 x i64>
  ret <2 x i64> %1
}

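; vpternlogd tests. The imm8 value 4 has a single bit set, i.e. the result is
; 1 for exactly one combination of the three input bits; the masked forms
; reuse operand %__A as the passthrough, matching the instruction's
; destructive first source/destination operand.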
define <2 x i64> @test_mm_ternarylogic_epi32(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; CHECK-LABEL: test_mm_ternarylogic_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = bitcast <2 x i64> %__C to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

declare <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32) #2

define <2 x i64> @test_mm_mask_ternarylogic_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_mask_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = bitcast <2 x i64> %__C to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> %0
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_ternarylogic_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_maskz_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = bitcast <2 x i64> %__C to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> zeroinitializer
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <4 x i64> @test_mm256_ternarylogic_epi32(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; CHECK-LABEL: test_mm256_ternarylogic_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = bitcast <4 x i64> %__C to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

declare <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32) #2

define <4 x i64> @test_mm256_mask_ternarylogic_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_mask_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = bitcast <4 x i64> %__C to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_ternarylogic_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_maskz_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = bitcast <4 x i64> %__C to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

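; The 64-bit ternary-logic forms are the same except no bitcasts are needed
; and the <8 x i1> mask is narrowed to <2 x i1> or <4 x i1> before the select.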
define <2 x i64> @test_mm_ternarylogic_epi64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; CHECK-LABEL: test_mm_ternarylogic_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
  ret <2 x i64> %0
}

declare <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32) #2

define <2 x i64> @test_mm_mask_ternarylogic_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_mask_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__A
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_ternarylogic_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_maskz_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

define <4 x i64> @test_mm256_ternarylogic_epi64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; CHECK-LABEL: test_mm256_ternarylogic_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
  ret <4 x i64> %0
}

declare <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32) #2

define <4 x i64> @test_mm256_mask_ternarylogic_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_mask_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__A
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_ternarylogic_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_maskz_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}

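; The mask2 variants of permutex2var merge the result into the index operand
; (%__I), so the expected lowering is vpermi2* writing the index register,
; followed by a plain move into the return register.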
define <2 x i64> @test_mm_mask2_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask2_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__I to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %1
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <4 x i64> @test_mm256_mask2_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__I to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %1
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <2 x double> @test_mm_mask2_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x double> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask2_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
  %1 = bitcast <2 x i64> %__I to <2 x double>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %3 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %1
  ret <2 x double> %3
}

define <4 x double> @test_mm256_mask2_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
  %1 = bitcast <4 x i64> %__I to <4 x double>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %1
  ret <4 x double> %3
}

define <4 x float> @test_mm_mask2_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, i8 zeroext %__U, <4 x float> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask2_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__I to <4 x i32>
  %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
  %2 = bitcast <2 x i64> %__I to <4 x float>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %2
  ret <4 x float> %4
}

define <8 x float> @test_mm256_mask2_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, i8 zeroext %__U, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT:    vmovaps %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT:    vmovaps %ymm1, %ymm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__I to <8 x i32>
  %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
  %2 = bitcast <4 x i64> %__I to <8 x float>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2
  ret <8 x float> %4
}

define <2 x i64> @test_mm_mask2_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask2_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__I
  ret <2 x i64> %2
}

define <4 x i64> @test_mm256_mask2_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
   3777 ; X86-LABEL: test_mm256_mask2_permutex2var_epi64:
   3778 ; X86:       # %bb.0: # %entry
   3779 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3780 ; X86-NEXT:    kmovw %eax, %k1
   3781 ; X86-NEXT:    vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
   3782 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
   3783 ; X86-NEXT:    retl
   3784 ;
   3785 ; X64-LABEL: test_mm256_mask2_permutex2var_epi64:
   3786 ; X64:       # %bb.0: # %entry
   3787 ; X64-NEXT:    kmovw %edi, %k1
   3788 ; X64-NEXT:    vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
   3789 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
   3790 ; X64-NEXT:    retq
   3791 entry:
   3792   %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
   3793   %1 = bitcast i8 %__U to <8 x i1>
   3794   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   3795   %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__I
   3796   ret <4 x i64> %2
   3797 }
   3798 
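        ; NOTE: In the unmasked, mask and maskz variants the passthrough (if any)
        ; is the first data operand %__A, so llc selects vpermt2*, which updates
        ; the first-source register in place and needs no extra register copy.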
   3799 define <2 x i64> @test_mm_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
   3800 ; CHECK-LABEL: test_mm_permutex2var_epi32:
   3801 ; CHECK:       # %bb.0: # %entry
   3802 ; CHECK-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0
   3803 ; CHECK-NEXT:    ret{{[l|q]}}
   3804 entry:
   3805   %0 = bitcast <2 x i64> %__A to <4 x i32>
   3806   %1 = bitcast <2 x i64> %__I to <4 x i32>
   3807   %2 = bitcast <2 x i64> %__B to <4 x i32>
   3808   %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
   3809   %4 = bitcast <4 x i32> %3 to <2 x i64>
   3810   ret <2 x i64> %4
   3811 }
   3812 
   3813 define <2 x i64> @test_mm_mask_permutex2var_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
   3814 ; X86-LABEL: test_mm_mask_permutex2var_epi32:
   3815 ; X86:       # %bb.0: # %entry
   3816 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3817 ; X86-NEXT:    kmovw %eax, %k1
   3818 ; X86-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
   3819 ; X86-NEXT:    retl
   3820 ;
   3821 ; X64-LABEL: test_mm_mask_permutex2var_epi32:
   3822 ; X64:       # %bb.0: # %entry
   3823 ; X64-NEXT:    kmovw %edi, %k1
   3824 ; X64-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
   3825 ; X64-NEXT:    retq
   3826 entry:
   3827   %0 = bitcast <2 x i64> %__A to <4 x i32>
   3828   %1 = bitcast <2 x i64> %__I to <4 x i32>
   3829   %2 = bitcast <2 x i64> %__B to <4 x i32>
   3830   %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
   3831   %4 = bitcast i8 %__U to <8 x i1>
   3832   %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   3833   %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0
   3834   %6 = bitcast <4 x i32> %5 to <2 x i64>
   3835   ret <2 x i64> %6
   3836 }
   3837 
   3838 define <2 x i64> @test_mm_maskz_permutex2var_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
   3839 ; X86-LABEL: test_mm_maskz_permutex2var_epi32:
   3840 ; X86:       # %bb.0: # %entry
   3841 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3842 ; X86-NEXT:    kmovw %eax, %k1
   3843 ; X86-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
   3844 ; X86-NEXT:    retl
   3845 ;
   3846 ; X64-LABEL: test_mm_maskz_permutex2var_epi32:
   3847 ; X64:       # %bb.0: # %entry
   3848 ; X64-NEXT:    kmovw %edi, %k1
   3849 ; X64-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
   3850 ; X64-NEXT:    retq
   3851 entry:
   3852   %0 = bitcast <2 x i64> %__A to <4 x i32>
   3853   %1 = bitcast <2 x i64> %__I to <4 x i32>
   3854   %2 = bitcast <2 x i64> %__B to <4 x i32>
   3855   %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
   3856   %4 = bitcast i8 %__U to <8 x i1>
   3857   %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   3858   %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer
   3859   %6 = bitcast <4 x i32> %5 to <2 x i64>
   3860   ret <2 x i64> %6
   3861 }
   3862 
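        ; NOTE: When the result has fewer than eight lanes, the i8 mask is
        ; bitcast to <8 x i1> and its low lanes are extracted with a
        ; shufflevector before the select; the eight-lane 256-bit epi32 and ps
        ; variants use the bitcast mask directly.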
   3863 define <4 x i64> @test_mm256_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
   3864 ; CHECK-LABEL: test_mm256_permutex2var_epi32:
   3865 ; CHECK:       # %bb.0: # %entry
   3866 ; CHECK-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0
   3867 ; CHECK-NEXT:    ret{{[l|q]}}
   3868 entry:
   3869   %0 = bitcast <4 x i64> %__A to <8 x i32>
   3870   %1 = bitcast <4 x i64> %__I to <8 x i32>
   3871   %2 = bitcast <4 x i64> %__B to <8 x i32>
   3872   %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
   3873   %4 = bitcast <8 x i32> %3 to <4 x i64>
   3874   ret <4 x i64> %4
   3875 }
   3876 
   3877 define <4 x i64> @test_mm256_mask_permutex2var_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
   3878 ; X86-LABEL: test_mm256_mask_permutex2var_epi32:
   3879 ; X86:       # %bb.0: # %entry
   3880 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3881 ; X86-NEXT:    kmovw %eax, %k1
   3882 ; X86-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
   3883 ; X86-NEXT:    retl
   3884 ;
   3885 ; X64-LABEL: test_mm256_mask_permutex2var_epi32:
   3886 ; X64:       # %bb.0: # %entry
   3887 ; X64-NEXT:    kmovw %edi, %k1
   3888 ; X64-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
   3889 ; X64-NEXT:    retq
   3890 entry:
   3891   %0 = bitcast <4 x i64> %__A to <8 x i32>
   3892   %1 = bitcast <4 x i64> %__I to <8 x i32>
   3893   %2 = bitcast <4 x i64> %__B to <8 x i32>
   3894   %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
   3895   %4 = bitcast i8 %__U to <8 x i1>
   3896   %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
   3897   %6 = bitcast <8 x i32> %5 to <4 x i64>
   3898   ret <4 x i64> %6
   3899 }
   3900 
   3901 define <4 x i64> @test_mm256_maskz_permutex2var_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
   3902 ; X86-LABEL: test_mm256_maskz_permutex2var_epi32:
   3903 ; X86:       # %bb.0: # %entry
   3904 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3905 ; X86-NEXT:    kmovw %eax, %k1
   3906 ; X86-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
   3907 ; X86-NEXT:    retl
   3908 ;
   3909 ; X64-LABEL: test_mm256_maskz_permutex2var_epi32:
   3910 ; X64:       # %bb.0: # %entry
   3911 ; X64-NEXT:    kmovw %edi, %k1
   3912 ; X64-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
   3913 ; X64-NEXT:    retq
   3914 entry:
   3915   %0 = bitcast <4 x i64> %__A to <8 x i32>
   3916   %1 = bitcast <4 x i64> %__I to <8 x i32>
   3917   %2 = bitcast <4 x i64> %__B to <8 x i32>
   3918   %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
   3919   %4 = bitcast i8 %__U to <8 x i1>
   3920   %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
   3921   %6 = bitcast <8 x i32> %5 to <4 x i64>
   3922   ret <4 x i64> %6
   3923 }
   3924 
   3925 define <2 x double> @test_mm_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
   3926 ; CHECK-LABEL: test_mm_permutex2var_pd:
   3927 ; CHECK:       # %bb.0: # %entry
   3928 ; CHECK-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0
   3929 ; CHECK-NEXT:    ret{{[l|q]}}
   3930 entry:
   3931   %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
   3932   ret <2 x double> %0
   3933 }
   3934 
   3935 define <2 x double> @test_mm_mask_permutex2var_pd(<2 x double> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x double> %__B) {
   3936 ; X86-LABEL: test_mm_mask_permutex2var_pd:
   3937 ; X86:       # %bb.0: # %entry
   3938 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3939 ; X86-NEXT:    kmovw %eax, %k1
   3940 ; X86-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
   3941 ; X86-NEXT:    retl
   3942 ;
   3943 ; X64-LABEL: test_mm_mask_permutex2var_pd:
   3944 ; X64:       # %bb.0: # %entry
   3945 ; X64-NEXT:    kmovw %edi, %k1
   3946 ; X64-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
   3947 ; X64-NEXT:    retq
   3948 entry:
   3949   %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
   3950   %1 = bitcast i8 %__U to <8 x i1>
   3951   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   3952   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
   3953   ret <2 x double> %2
   3954 }
   3955 
   3956 define <2 x double> @test_mm_maskz_permutex2var_pd(i8 zeroext %__U, <2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
   3957 ; X86-LABEL: test_mm_maskz_permutex2var_pd:
   3958 ; X86:       # %bb.0: # %entry
   3959 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3960 ; X86-NEXT:    kmovw %eax, %k1
   3961 ; X86-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
   3962 ; X86-NEXT:    retl
   3963 ;
   3964 ; X64-LABEL: test_mm_maskz_permutex2var_pd:
   3965 ; X64:       # %bb.0: # %entry
   3966 ; X64-NEXT:    kmovw %edi, %k1
   3967 ; X64-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
   3968 ; X64-NEXT:    retq
   3969 entry:
   3970   %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
   3971   %1 = bitcast i8 %__U to <8 x i1>
   3972   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   3973   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
   3974   ret <2 x double> %2
   3975 }
   3976 
   3977 define <4 x double> @test_mm256_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
   3978 ; CHECK-LABEL: test_mm256_permutex2var_pd:
   3979 ; CHECK:       # %bb.0: # %entry
   3980 ; CHECK-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0
   3981 ; CHECK-NEXT:    ret{{[l|q]}}
   3982 entry:
   3983   %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
   3984   ret <4 x double> %0
   3985 }
   3986 
   3987 define <4 x double> @test_mm256_mask_permutex2var_pd(<4 x double> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x double> %__B) {
   3988 ; X86-LABEL: test_mm256_mask_permutex2var_pd:
   3989 ; X86:       # %bb.0: # %entry
   3990 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3991 ; X86-NEXT:    kmovw %eax, %k1
   3992 ; X86-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
   3993 ; X86-NEXT:    retl
   3994 ;
   3995 ; X64-LABEL: test_mm256_mask_permutex2var_pd:
   3996 ; X64:       # %bb.0: # %entry
   3997 ; X64-NEXT:    kmovw %edi, %k1
   3998 ; X64-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
   3999 ; X64-NEXT:    retq
   4000 entry:
   4001   %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
   4002   %1 = bitcast i8 %__U to <8 x i1>
   4003   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4004   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
   4005   ret <4 x double> %2
   4006 }
   4007 
   4008 define <4 x double> @test_mm256_maskz_permutex2var_pd(i8 zeroext %__U, <4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
   4009 ; X86-LABEL: test_mm256_maskz_permutex2var_pd:
   4010 ; X86:       # %bb.0: # %entry
   4011 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4012 ; X86-NEXT:    kmovw %eax, %k1
   4013 ; X86-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
   4014 ; X86-NEXT:    retl
   4015 ;
   4016 ; X64-LABEL: test_mm256_maskz_permutex2var_pd:
   4017 ; X64:       # %bb.0: # %entry
   4018 ; X64-NEXT:    kmovw %edi, %k1
   4019 ; X64-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
   4020 ; X64-NEXT:    retq
   4021 entry:
   4022   %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
   4023   %1 = bitcast i8 %__U to <8 x i1>
   4024   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4025   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
   4026   ret <4 x double> %2
   4027 }
   4028 
   4029 define <4 x float> @test_mm_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
   4030 ; CHECK-LABEL: test_mm_permutex2var_ps:
   4031 ; CHECK:       # %bb.0: # %entry
   4032 ; CHECK-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0
   4033 ; CHECK-NEXT:    ret{{[l|q]}}
   4034 entry:
   4035   %0 = bitcast <2 x i64> %__I to <4 x i32>
   4036   %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
   4037   ret <4 x float> %1
   4038 }
   4039 
   4040 define <4 x float> @test_mm_mask_permutex2var_ps(<4 x float> %__A, i8 zeroext %__U, <2 x i64> %__I, <4 x float> %__B) {
   4041 ; X86-LABEL: test_mm_mask_permutex2var_ps:
   4042 ; X86:       # %bb.0: # %entry
   4043 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4044 ; X86-NEXT:    kmovw %eax, %k1
   4045 ; X86-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
   4046 ; X86-NEXT:    retl
   4047 ;
   4048 ; X64-LABEL: test_mm_mask_permutex2var_ps:
   4049 ; X64:       # %bb.0: # %entry
   4050 ; X64-NEXT:    kmovw %edi, %k1
   4051 ; X64-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
   4052 ; X64-NEXT:    retq
   4053 entry:
   4054   %0 = bitcast <2 x i64> %__I to <4 x i32>
   4055   %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
   4056   %2 = bitcast i8 %__U to <8 x i1>
   4057   %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4058   %3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %__A
   4059   ret <4 x float> %3
   4060 }
   4061 
   4062 define <4 x float> @test_mm_maskz_permutex2var_ps(i8 zeroext %__U, <4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
   4063 ; X86-LABEL: test_mm_maskz_permutex2var_ps:
   4064 ; X86:       # %bb.0: # %entry
   4065 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4066 ; X86-NEXT:    kmovw %eax, %k1
   4067 ; X86-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
   4068 ; X86-NEXT:    retl
   4069 ;
   4070 ; X64-LABEL: test_mm_maskz_permutex2var_ps:
   4071 ; X64:       # %bb.0: # %entry
   4072 ; X64-NEXT:    kmovw %edi, %k1
   4073 ; X64-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
   4074 ; X64-NEXT:    retq
   4075 entry:
   4076   %0 = bitcast <2 x i64> %__I to <4 x i32>
   4077   %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
   4078   %2 = bitcast i8 %__U to <8 x i1>
   4079   %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4080   %3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> zeroinitializer
   4081   ret <4 x float> %3
   4082 }
   4083 
   4084 define <8 x float> @test_mm256_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
   4085 ; CHECK-LABEL: test_mm256_permutex2var_ps:
   4086 ; CHECK:       # %bb.0: # %entry
   4087 ; CHECK-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0
   4088 ; CHECK-NEXT:    ret{{[l|q]}}
   4089 entry:
   4090   %0 = bitcast <4 x i64> %__I to <8 x i32>
   4091   %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
   4092   ret <8 x float> %1
   4093 }
   4094 
   4095 define <8 x float> @test_mm256_mask_permutex2var_ps(<8 x float> %__A, i8 zeroext %__U, <4 x i64> %__I, <8 x float> %__B) {
   4096 ; X86-LABEL: test_mm256_mask_permutex2var_ps:
   4097 ; X86:       # %bb.0: # %entry
   4098 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4099 ; X86-NEXT:    kmovw %eax, %k1
   4100 ; X86-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
   4101 ; X86-NEXT:    retl
   4102 ;
   4103 ; X64-LABEL: test_mm256_mask_permutex2var_ps:
   4104 ; X64:       # %bb.0: # %entry
   4105 ; X64-NEXT:    kmovw %edi, %k1
   4106 ; X64-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
   4107 ; X64-NEXT:    retq
   4108 entry:
   4109   %0 = bitcast <4 x i64> %__I to <8 x i32>
   4110   %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
   4111   %2 = bitcast i8 %__U to <8 x i1>
   4112   %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %__A
   4113   ret <8 x float> %3
   4114 }
   4115 
   4116 define <8 x float> @test_mm256_maskz_permutex2var_ps(i8 zeroext %__U, <8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
   4117 ; X86-LABEL: test_mm256_maskz_permutex2var_ps:
   4118 ; X86:       # %bb.0: # %entry
   4119 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4120 ; X86-NEXT:    kmovw %eax, %k1
   4121 ; X86-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
   4122 ; X86-NEXT:    retl
   4123 ;
   4124 ; X64-LABEL: test_mm256_maskz_permutex2var_ps:
   4125 ; X64:       # %bb.0: # %entry
   4126 ; X64-NEXT:    kmovw %edi, %k1
   4127 ; X64-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
   4128 ; X64-NEXT:    retq
   4129 entry:
   4130   %0 = bitcast <4 x i64> %__I to <8 x i32>
   4131   %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
   4132   %2 = bitcast i8 %__U to <8 x i1>
   4133   %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
   4134   ret <8 x float> %3
   4135 }
   4136 
   4137 define <2 x i64> @test_mm_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
   4138 ; CHECK-LABEL: test_mm_permutex2var_epi64:
   4139 ; CHECK:       # %bb.0: # %entry
   4140 ; CHECK-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0
   4141 ; CHECK-NEXT:    ret{{[l|q]}}
   4142 entry:
   4143   %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
   4144   ret <2 x i64> %0
   4145 }
   4146 
   4147 define <2 x i64> @test_mm_mask_permutex2var_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
   4148 ; X86-LABEL: test_mm_mask_permutex2var_epi64:
   4149 ; X86:       # %bb.0: # %entry
   4150 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4151 ; X86-NEXT:    kmovw %eax, %k1
   4152 ; X86-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
   4153 ; X86-NEXT:    retl
   4154 ;
   4155 ; X64-LABEL: test_mm_mask_permutex2var_epi64:
   4156 ; X64:       # %bb.0: # %entry
   4157 ; X64-NEXT:    kmovw %edi, %k1
   4158 ; X64-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
   4159 ; X64-NEXT:    retq
   4160 entry:
   4161   %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
   4162   %1 = bitcast i8 %__U to <8 x i1>
   4163   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   4164   %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__A
   4165   ret <2 x i64> %2
   4166 }
   4167 
   4168 define <2 x i64> @test_mm_maskz_permutex2var_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
   4169 ; X86-LABEL: test_mm_maskz_permutex2var_epi64:
   4170 ; X86:       # %bb.0: # %entry
   4171 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4172 ; X86-NEXT:    kmovw %eax, %k1
   4173 ; X86-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
   4174 ; X86-NEXT:    retl
   4175 ;
   4176 ; X64-LABEL: test_mm_maskz_permutex2var_epi64:
   4177 ; X64:       # %bb.0: # %entry
   4178 ; X64-NEXT:    kmovw %edi, %k1
   4179 ; X64-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
   4180 ; X64-NEXT:    retq
   4181 entry:
   4182   %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
   4183   %1 = bitcast i8 %__U to <8 x i1>
   4184   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   4185   %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
   4186   ret <2 x i64> %2
   4187 }
   4188 
   4189 define <4 x i64> @test_mm256_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
   4190 ; CHECK-LABEL: test_mm256_permutex2var_epi64:
   4191 ; CHECK:       # %bb.0: # %entry
   4192 ; CHECK-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0
   4193 ; CHECK-NEXT:    ret{{[l|q]}}
   4194 entry:
   4195   %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
   4196   ret <4 x i64> %0
   4197 }
   4198 
   4199 define <4 x i64> @test_mm256_mask_permutex2var_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
   4200 ; X86-LABEL: test_mm256_mask_permutex2var_epi64:
   4201 ; X86:       # %bb.0: # %entry
   4202 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4203 ; X86-NEXT:    kmovw %eax, %k1
   4204 ; X86-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
   4205 ; X86-NEXT:    retl
   4206 ;
   4207 ; X64-LABEL: test_mm256_mask_permutex2var_epi64:
   4208 ; X64:       # %bb.0: # %entry
   4209 ; X64-NEXT:    kmovw %edi, %k1
   4210 ; X64-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
   4211 ; X64-NEXT:    retq
   4212 entry:
   4213   %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
   4214   %1 = bitcast i8 %__U to <8 x i1>
   4215   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4216   %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__A
   4217   ret <4 x i64> %2
   4218 }
   4219 
   4220 define <4 x i64> @test_mm256_maskz_permutex2var_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
   4221 ; X86-LABEL: test_mm256_maskz_permutex2var_epi64:
   4222 ; X86:       # %bb.0: # %entry
   4223 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4224 ; X86-NEXT:    kmovw %eax, %k1
   4225 ; X86-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
   4226 ; X86-NEXT:    retl
   4227 ;
   4228 ; X64-LABEL: test_mm256_maskz_permutex2var_epi64:
   4229 ; X64:       # %bb.0: # %entry
   4230 ; X64-NEXT:    kmovw %edi, %k1
   4231 ; X64-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
   4232 ; X64-NEXT:    retq
   4233 entry:
   4234   %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
   4235   %1 = bitcast i8 %__U to <8 x i1>
   4236   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4237   %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
   4238   ret <4 x i64> %2
   4239 }
   4240 
   4241 
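        ; NOTE: The fmsub/fnmadd/fnmsub tests below negate the relevant
        ; llvm.fma operands by subtracting them from a splat of -0.0 (the IR
        ; idiom for fneg), and the mask/mask3/maskz flavours select between the
        ; fma result and %__A, %__C or zero respectively.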
   4242 define <2 x double> @test_mm_mask_fmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
   4243 ; X86-LABEL: test_mm_mask_fmadd_pd:
   4244 ; X86:       # %bb.0: # %entry
   4245 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4246 ; X86-NEXT:    kmovw %eax, %k1
   4247 ; X86-NEXT:    vfmadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
   4248 ; X86-NEXT:    retl
   4249 ;
   4250 ; X64-LABEL: test_mm_mask_fmadd_pd:
   4251 ; X64:       # %bb.0: # %entry
   4252 ; X64-NEXT:    kmovw %edi, %k1
   4253 ; X64-NEXT:    vfmadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
   4254 ; X64-NEXT:    retq
   4255 entry:
   4256   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
   4257   %1 = bitcast i8 %__U to <8 x i1>
   4258   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   4259   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
   4260   ret <2 x double> %2
   4261 }
   4262 
   4263 define <2 x double> @test_mm_mask_fmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
   4264 ; X86-LABEL: test_mm_mask_fmsub_pd:
   4265 ; X86:       # %bb.0: # %entry
   4266 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4267 ; X86-NEXT:    kmovw %eax, %k1
   4268 ; X86-NEXT:    vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
   4269 ; X86-NEXT:    retl
   4270 ;
   4271 ; X64-LABEL: test_mm_mask_fmsub_pd:
   4272 ; X64:       # %bb.0: # %entry
   4273 ; X64-NEXT:    kmovw %edi, %k1
   4274 ; X64-NEXT:    vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
   4275 ; X64-NEXT:    retq
   4276 entry:
   4277   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
   4278   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
   4279   %1 = bitcast i8 %__U to <8 x i1>
   4280   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   4281   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
   4282   ret <2 x double> %2
   4283 }
   4284 
   4285 define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
   4286 ; X86-LABEL: test_mm_mask3_fmadd_pd:
   4287 ; X86:       # %bb.0: # %entry
   4288 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4289 ; X86-NEXT:    kmovw %eax, %k1
   4290 ; X86-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
   4291 ; X86-NEXT:    vmovapd %xmm2, %xmm0
   4292 ; X86-NEXT:    retl
   4293 ;
   4294 ; X64-LABEL: test_mm_mask3_fmadd_pd:
   4295 ; X64:       # %bb.0: # %entry
   4296 ; X64-NEXT:    kmovw %edi, %k1
   4297 ; X64-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
   4298 ; X64-NEXT:    vmovapd %xmm2, %xmm0
   4299 ; X64-NEXT:    retq
   4300 entry:
   4301   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
   4302   %1 = bitcast i8 %__U to <8 x i1>
   4303   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   4304   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
   4305   ret <2 x double> %2
   4306 }
   4307 
   4308 define <2 x double> @test_mm_mask3_fnmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
   4309 ; X86-LABEL: test_mm_mask3_fnmadd_pd:
   4310 ; X86:       # %bb.0: # %entry
   4311 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4312 ; X86-NEXT:    kmovw %eax, %k1
   4313 ; X86-NEXT:    vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
   4314 ; X86-NEXT:    vmovapd %xmm2, %xmm0
   4315 ; X86-NEXT:    retl
   4316 ;
   4317 ; X64-LABEL: test_mm_mask3_fnmadd_pd:
   4318 ; X64:       # %bb.0: # %entry
   4319 ; X64-NEXT:    kmovw %edi, %k1
   4320 ; X64-NEXT:    vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
   4321 ; X64-NEXT:    vmovapd %xmm2, %xmm0
   4322 ; X64-NEXT:    retq
   4323 entry:
   4324   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
   4325   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
   4326   %1 = bitcast i8 %__U to <8 x i1>
   4327   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   4328   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
   4329   ret <2 x double> %2
   4330 }
   4331 
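        ; NOTE: The chosen multiply-add form tracks the passthrough operand:
        ; the mask variants above merge into %__A and use the 132 form in
        ; place, mask3 merges into the addend %__C with the 231 form plus a
        ; trailing vmov*, and the maskz variants below use the 213 form.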
   4332 define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
   4333 ; X86-LABEL: test_mm_maskz_fmadd_pd:
   4334 ; X86:       # %bb.0: # %entry
   4335 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4336 ; X86-NEXT:    kmovw %eax, %k1
   4337 ; X86-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
   4338 ; X86-NEXT:    retl
   4339 ;
   4340 ; X64-LABEL: test_mm_maskz_fmadd_pd:
   4341 ; X64:       # %bb.0: # %entry
   4342 ; X64-NEXT:    kmovw %edi, %k1
   4343 ; X64-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
   4344 ; X64-NEXT:    retq
   4345 entry:
   4346   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
   4347   %1 = bitcast i8 %__U to <8 x i1>
   4348   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   4349   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
   4350   ret <2 x double> %2
   4351 }
   4352 
   4353 define <2 x double> @test_mm_maskz_fmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
   4354 ; X86-LABEL: test_mm_maskz_fmsub_pd:
   4355 ; X86:       # %bb.0: # %entry
   4356 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4357 ; X86-NEXT:    kmovw %eax, %k1
   4358 ; X86-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
   4359 ; X86-NEXT:    retl
   4360 ;
   4361 ; X64-LABEL: test_mm_maskz_fmsub_pd:
   4362 ; X64:       # %bb.0: # %entry
   4363 ; X64-NEXT:    kmovw %edi, %k1
   4364 ; X64-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
   4365 ; X64-NEXT:    retq
   4366 entry:
   4367   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
   4368   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
   4369   %1 = bitcast i8 %__U to <8 x i1>
   4370   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   4371   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
   4372   ret <2 x double> %2
   4373 }
   4374 
   4375 define <2 x double> @test_mm_maskz_fnmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
   4376 ; X86-LABEL: test_mm_maskz_fnmadd_pd:
   4377 ; X86:       # %bb.0: # %entry
   4378 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4379 ; X86-NEXT:    kmovw %eax, %k1
   4380 ; X86-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
   4381 ; X86-NEXT:    retl
   4382 ;
   4383 ; X64-LABEL: test_mm_maskz_fnmadd_pd:
   4384 ; X64:       # %bb.0: # %entry
   4385 ; X64-NEXT:    kmovw %edi, %k1
   4386 ; X64-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
   4387 ; X64-NEXT:    retq
   4388 entry:
   4389   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
   4390   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
   4391   %1 = bitcast i8 %__U to <8 x i1>
   4392   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   4393   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
   4394   ret <2 x double> %2
   4395 }
   4396 
   4397 define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
   4398 ; X86-LABEL: test_mm_maskz_fnmsub_pd:
   4399 ; X86:       # %bb.0: # %entry
   4400 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4401 ; X86-NEXT:    kmovw %eax, %k1
   4402 ; X86-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
   4403 ; X86-NEXT:    retl
   4404 ;
   4405 ; X64-LABEL: test_mm_maskz_fnmsub_pd:
   4406 ; X64:       # %bb.0: # %entry
   4407 ; X64-NEXT:    kmovw %edi, %k1
   4408 ; X64-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
   4409 ; X64-NEXT:    retq
   4410 entry:
   4411   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
   4412   %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
   4413   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %sub1.i) #9
   4414   %1 = bitcast i8 %__U to <8 x i1>
   4415   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   4416   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
   4417   ret <2 x double> %2
   4418 }
   4419 
   4420 define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
   4421 ; X86-LABEL: test_mm256_mask_fmadd_pd:
   4422 ; X86:       # %bb.0: # %entry
   4423 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4424 ; X86-NEXT:    kmovw %eax, %k1
   4425 ; X86-NEXT:    vfmadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
   4426 ; X86-NEXT:    retl
   4427 ;
   4428 ; X64-LABEL: test_mm256_mask_fmadd_pd:
   4429 ; X64:       # %bb.0: # %entry
   4430 ; X64-NEXT:    kmovw %edi, %k1
   4431 ; X64-NEXT:    vfmadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
   4432 ; X64-NEXT:    retq
   4433 entry:
   4434   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
   4435   %1 = bitcast i8 %__U to <8 x i1>
   4436   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4437   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
   4438   ret <4 x double> %2
   4439 }
   4440 
   4441 define <4 x double> @test_mm256_mask_fmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
   4442 ; X86-LABEL: test_mm256_mask_fmsub_pd:
   4443 ; X86:       # %bb.0: # %entry
   4444 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4445 ; X86-NEXT:    kmovw %eax, %k1
   4446 ; X86-NEXT:    vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
   4447 ; X86-NEXT:    retl
   4448 ;
   4449 ; X64-LABEL: test_mm256_mask_fmsub_pd:
   4450 ; X64:       # %bb.0: # %entry
   4451 ; X64-NEXT:    kmovw %edi, %k1
   4452 ; X64-NEXT:    vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
   4453 ; X64-NEXT:    retq
   4454 entry:
   4455   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4456   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
   4457   %1 = bitcast i8 %__U to <8 x i1>
   4458   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4459   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
   4460   ret <4 x double> %2
   4461 }
   4462 
   4463 define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
   4464 ; X86-LABEL: test_mm256_mask3_fmadd_pd:
   4465 ; X86:       # %bb.0: # %entry
   4466 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4467 ; X86-NEXT:    kmovw %eax, %k1
   4468 ; X86-NEXT:    vfmadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
   4469 ; X86-NEXT:    vmovapd %ymm2, %ymm0
   4470 ; X86-NEXT:    retl
   4471 ;
   4472 ; X64-LABEL: test_mm256_mask3_fmadd_pd:
   4473 ; X64:       # %bb.0: # %entry
   4474 ; X64-NEXT:    kmovw %edi, %k1
   4475 ; X64-NEXT:    vfmadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
   4476 ; X64-NEXT:    vmovapd %ymm2, %ymm0
   4477 ; X64-NEXT:    retq
   4478 entry:
   4479   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
   4480   %1 = bitcast i8 %__U to <8 x i1>
   4481   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4482   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
   4483   ret <4 x double> %2
   4484 }
   4485 
   4486 define <4 x double> @test_mm256_mask3_fnmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
   4487 ; X86-LABEL: test_mm256_mask3_fnmadd_pd:
   4488 ; X86:       # %bb.0: # %entry
   4489 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4490 ; X86-NEXT:    kmovw %eax, %k1
   4491 ; X86-NEXT:    vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
   4492 ; X86-NEXT:    vmovapd %ymm2, %ymm0
   4493 ; X86-NEXT:    retl
   4494 ;
   4495 ; X64-LABEL: test_mm256_mask3_fnmadd_pd:
   4496 ; X64:       # %bb.0: # %entry
   4497 ; X64-NEXT:    kmovw %edi, %k1
   4498 ; X64-NEXT:    vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
   4499 ; X64-NEXT:    vmovapd %ymm2, %ymm0
   4500 ; X64-NEXT:    retq
   4501 entry:
   4502   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   4503   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
   4504   %1 = bitcast i8 %__U to <8 x i1>
   4505   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4506   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
   4507   ret <4 x double> %2
   4508 }
   4509 
   4510 define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
   4511 ; X86-LABEL: test_mm256_maskz_fmadd_pd:
   4512 ; X86:       # %bb.0: # %entry
   4513 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4514 ; X86-NEXT:    kmovw %eax, %k1
   4515 ; X86-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
   4516 ; X86-NEXT:    retl
   4517 ;
   4518 ; X64-LABEL: test_mm256_maskz_fmadd_pd:
   4519 ; X64:       # %bb.0: # %entry
   4520 ; X64-NEXT:    kmovw %edi, %k1
   4521 ; X64-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
   4522 ; X64-NEXT:    retq
   4523 entry:
   4524   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
   4525   %1 = bitcast i8 %__U to <8 x i1>
   4526   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4527   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
   4528   ret <4 x double> %2
   4529 }
   4530 
   4531 define <4 x double> @test_mm256_maskz_fmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
   4532 ; X86-LABEL: test_mm256_maskz_fmsub_pd:
   4533 ; X86:       # %bb.0: # %entry
   4534 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4535 ; X86-NEXT:    kmovw %eax, %k1
   4536 ; X86-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
   4537 ; X86-NEXT:    retl
   4538 ;
   4539 ; X64-LABEL: test_mm256_maskz_fmsub_pd:
   4540 ; X64:       # %bb.0: # %entry
   4541 ; X64-NEXT:    kmovw %edi, %k1
   4542 ; X64-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
   4543 ; X64-NEXT:    retq
   4544 entry:
   4545   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4546   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
   4547   %1 = bitcast i8 %__U to <8 x i1>
   4548   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4549   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
   4550   ret <4 x double> %2
   4551 }
   4552 
   4553 define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
   4554 ; X86-LABEL: test_mm256_maskz_fnmadd_pd:
   4555 ; X86:       # %bb.0: # %entry
   4556 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4557 ; X86-NEXT:    kmovw %eax, %k1
   4558 ; X86-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
   4559 ; X86-NEXT:    retl
   4560 ;
   4561 ; X64-LABEL: test_mm256_maskz_fnmadd_pd:
   4562 ; X64:       # %bb.0: # %entry
   4563 ; X64-NEXT:    kmovw %edi, %k1
   4564 ; X64-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
   4565 ; X64-NEXT:    retq
   4566 entry:
   4567   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   4568   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
   4569   %1 = bitcast i8 %__U to <8 x i1>
   4570   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4571   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
   4572   ret <4 x double> %2
   4573 }
   4574 
   4575 define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
   4576 ; X86-LABEL: test_mm256_maskz_fnmsub_pd:
   4577 ; X86:       # %bb.0: # %entry
   4578 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4579 ; X86-NEXT:    kmovw %eax, %k1
   4580 ; X86-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
   4581 ; X86-NEXT:    retl
   4582 ;
   4583 ; X64-LABEL: test_mm256_maskz_fnmsub_pd:
   4584 ; X64:       # %bb.0: # %entry
   4585 ; X64-NEXT:    kmovw %edi, %k1
   4586 ; X64-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
   4587 ; X64-NEXT:    retq
   4588 entry:
   4589   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   4590   %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4591   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %sub1.i) #9
   4592   %1 = bitcast i8 %__U to <8 x i1>
   4593   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4594   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
   4595   ret <4 x double> %2
   4596 }
   4597 
   4598 define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
   4599 ; X86-LABEL: test_mm_mask_fmadd_ps:
   4600 ; X86:       # %bb.0: # %entry
   4601 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4602 ; X86-NEXT:    kmovw %eax, %k1
   4603 ; X86-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
   4604 ; X86-NEXT:    retl
   4605 ;
   4606 ; X64-LABEL: test_mm_mask_fmadd_ps:
   4607 ; X64:       # %bb.0: # %entry
   4608 ; X64-NEXT:    kmovw %edi, %k1
   4609 ; X64-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
   4610 ; X64-NEXT:    retq
   4611 entry:
   4612   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
   4613   %1 = bitcast i8 %__U to <8 x i1>
   4614   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4615   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
   4616   ret <4 x float> %2
   4617 }
   4618 
   4619 define <4 x float> @test_mm_mask_fmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
   4620 ; X86-LABEL: test_mm_mask_fmsub_ps:
   4621 ; X86:       # %bb.0: # %entry
   4622 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4623 ; X86-NEXT:    kmovw %eax, %k1
   4624 ; X86-NEXT:    vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
   4625 ; X86-NEXT:    retl
   4626 ;
   4627 ; X64-LABEL: test_mm_mask_fmsub_ps:
   4628 ; X64:       # %bb.0: # %entry
   4629 ; X64-NEXT:    kmovw %edi, %k1
   4630 ; X64-NEXT:    vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
   4631 ; X64-NEXT:    retq
   4632 entry:
   4633   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4634   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
   4635   %1 = bitcast i8 %__U to <8 x i1>
   4636   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4637   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
   4638   ret <4 x float> %2
   4639 }
   4640 
   4641 define <4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
   4642 ; X86-LABEL: test_mm_mask3_fmadd_ps:
   4643 ; X86:       # %bb.0: # %entry
   4644 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4645 ; X86-NEXT:    kmovw %eax, %k1
   4646 ; X86-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
   4647 ; X86-NEXT:    vmovaps %xmm2, %xmm0
   4648 ; X86-NEXT:    retl
   4649 ;
   4650 ; X64-LABEL: test_mm_mask3_fmadd_ps:
   4651 ; X64:       # %bb.0: # %entry
   4652 ; X64-NEXT:    kmovw %edi, %k1
   4653 ; X64-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
   4654 ; X64-NEXT:    vmovaps %xmm2, %xmm0
   4655 ; X64-NEXT:    retq
   4656 entry:
   4657   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
   4658   %1 = bitcast i8 %__U to <8 x i1>
   4659   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4660   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
   4661   ret <4 x float> %2
   4662 }
   4663 
   4664 define <4 x float> @test_mm_mask3_fnmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
   4665 ; X86-LABEL: test_mm_mask3_fnmadd_ps:
   4666 ; X86:       # %bb.0: # %entry
   4667 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4668 ; X86-NEXT:    kmovw %eax, %k1
   4669 ; X86-NEXT:    vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
   4670 ; X86-NEXT:    vmovaps %xmm2, %xmm0
   4671 ; X86-NEXT:    retl
   4672 ;
   4673 ; X64-LABEL: test_mm_mask3_fnmadd_ps:
   4674 ; X64:       # %bb.0: # %entry
   4675 ; X64-NEXT:    kmovw %edi, %k1
   4676 ; X64-NEXT:    vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
   4677 ; X64-NEXT:    vmovaps %xmm2, %xmm0
   4678 ; X64-NEXT:    retq
   4679 entry:
   4680   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   4681   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
   4682   %1 = bitcast i8 %__U to <8 x i1>
   4683   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4684   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
   4685   ret <4 x float> %2
   4686 }
   4687 
   4688 define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
   4689 ; X86-LABEL: test_mm_maskz_fmadd_ps:
   4690 ; X86:       # %bb.0: # %entry
   4691 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4692 ; X86-NEXT:    kmovw %eax, %k1
   4693 ; X86-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
   4694 ; X86-NEXT:    retl
   4695 ;
   4696 ; X64-LABEL: test_mm_maskz_fmadd_ps:
   4697 ; X64:       # %bb.0: # %entry
   4698 ; X64-NEXT:    kmovw %edi, %k1
   4699 ; X64-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
   4700 ; X64-NEXT:    retq
   4701 entry:
   4702   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
   4703   %1 = bitcast i8 %__U to <8 x i1>
   4704   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4705   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
   4706   ret <4 x float> %2
   4707 }
   4708 
   4709 define <4 x float> @test_mm_maskz_fmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
   4710 ; X86-LABEL: test_mm_maskz_fmsub_ps:
   4711 ; X86:       # %bb.0: # %entry
   4712 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4713 ; X86-NEXT:    kmovw %eax, %k1
   4714 ; X86-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
   4715 ; X86-NEXT:    retl
   4716 ;
   4717 ; X64-LABEL: test_mm_maskz_fmsub_ps:
   4718 ; X64:       # %bb.0: # %entry
   4719 ; X64-NEXT:    kmovw %edi, %k1
   4720 ; X64-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
   4721 ; X64-NEXT:    retq
   4722 entry:
   4723   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4724   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
   4725   %1 = bitcast i8 %__U to <8 x i1>
   4726   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4727   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
   4728   ret <4 x float> %2
   4729 }
   4730 
   4731 define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
   4732 ; X86-LABEL: test_mm_maskz_fnmadd_ps:
   4733 ; X86:       # %bb.0: # %entry
   4734 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4735 ; X86-NEXT:    kmovw %eax, %k1
   4736 ; X86-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
   4737 ; X86-NEXT:    retl
   4738 ;
   4739 ; X64-LABEL: test_mm_maskz_fnmadd_ps:
   4740 ; X64:       # %bb.0: # %entry
   4741 ; X64-NEXT:    kmovw %edi, %k1
   4742 ; X64-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
   4743 ; X64-NEXT:    retq
   4744 entry:
   4745   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   4746   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
   4747   %1 = bitcast i8 %__U to <8 x i1>
   4748   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4749   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
   4750   ret <4 x float> %2
   4751 }
   4752 
   4753 define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
   4754 ; X86-LABEL: test_mm_maskz_fnmsub_ps:
   4755 ; X86:       # %bb.0: # %entry
   4756 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4757 ; X86-NEXT:    kmovw %eax, %k1
   4758 ; X86-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
   4759 ; X86-NEXT:    retl
   4760 ;
   4761 ; X64-LABEL: test_mm_maskz_fnmsub_ps:
   4762 ; X64:       # %bb.0: # %entry
   4763 ; X64-NEXT:    kmovw %edi, %k1
   4764 ; X64-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
   4765 ; X64-NEXT:    retq
   4766 entry:
   4767   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   4768   %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4769   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %sub1.i) #9
   4770   %1 = bitcast i8 %__U to <8 x i1>
   4771   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4772   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
   4773   ret <4 x float> %2
   4774 }
   4775 
define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask_fmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fnmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fnmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fnmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

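; NOTE: The fmaddsub/fmsubadd tests model the alternating operation with two
; @llvm.fma calls, one on %__C and one on its negation, blended by a
; shufflevector that takes even lanes from one result and odd lanes from the
; other: fmaddsub subtracts in the even lanes, fmsubadd adds there. Narrower
; masks are extracted from the <8 x i1> bitcast with a shufflevector before
; the final select.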
define <2 x double> @test_mm_mask_fmaddsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__A
  ret <2 x double> %5
}

define <2 x double> @test_mm_mask_fmsubadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__A
  ret <2 x double> %4
}

define <2 x double> @test_mm_mask3_fmaddsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__C
  ret <2 x double> %5
}

define <2 x double> @test_mm_maskz_fmaddsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> zeroinitializer
  ret <2 x double> %5
}

define <2 x double> @test_mm_maskz_fmsubadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> zeroinitializer
  ret <2 x double> %4
}

define <4 x double> @test_mm256_mask_fmaddsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__A
  ret <4 x double> %5
}

define <4 x double> @test_mm256_mask_fmsubadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__A
  ret <4 x double> %4
}

define <4 x double> @test_mm256_mask3_fmaddsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__C
  ret <4 x double> %5
}

define <4 x double> @test_mm256_maskz_fmaddsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> zeroinitializer
  ret <4 x double> %5
}

define <4 x double> @test_mm256_maskz_fmsubadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> zeroinitializer
  ret <4 x double> %4
}

define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__A
  ret <4 x float> %5
}

define <4 x float> @test_mm_mask_fmsubadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__A
  ret <4 x float> %4
}

define <4 x float> @test_mm_mask3_fmaddsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__C
  ret <4 x float> %5
}

define <4 x float> @test_mm_maskz_fmaddsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> zeroinitializer
  ret <4 x float> %5
}

define <4 x float> @test_mm_maskz_fmsubadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> zeroinitializer
  ret <4 x float> %4
}

define <8 x float> @test_mm256_mask_fmaddsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__A
  ret <8 x float> %5
}

define <8 x float> @test_mm256_mask_fmsubadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__A
  ret <8 x float> %4
}

define <8 x float> @test_mm256_mask3_fmaddsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__C
  ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmaddsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> zeroinitializer
  ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmsubadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer
  ret <8 x float> %4
}

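; NOTE: The mask3 fmsub and fmsubadd tests below blend the result into the
; third source, so a clear mask bit keeps the corresponding lane of %__C;
; the CHECK lines expect the 231 instruction forms writing to xmm2/ymm2.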
define <2 x double> @test_mm_mask3_fmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask3_fmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask3_fmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask3_fmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

define <2 x double> @test_mm_mask3_fmsubadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C
  ret <2 x double> %4
}

define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C
  ret <4 x double> %4
}

define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C
  ret <4 x float> %4
}

define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C
  ret <8 x float> %4
}

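; NOTE: For the fnmadd/fnmsub tests the negation is applied to the %__B
; multiplicand (and also to %__C for fnmsub) before the @llvm.fma call; the
; CHECK lines expect this to fold into the vfnmadd/vfnmsub instructions.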
   5627 define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
   5628 ; X86-LABEL: test_mm_mask_fnmadd_pd:
   5629 ; X86:       # %bb.0: # %entry
   5630 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5631 ; X86-NEXT:    kmovw %eax, %k1
   5632 ; X86-NEXT:    vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
   5633 ; X86-NEXT:    retl
   5634 ;
   5635 ; X64-LABEL: test_mm_mask_fnmadd_pd:
   5636 ; X64:       # %bb.0: # %entry
   5637 ; X64-NEXT:    kmovw %edi, %k1
   5638 ; X64-NEXT:    vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
   5639 ; X64-NEXT:    retq
   5640 entry:
   5641   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
   5642   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %__C) #9
   5643   %1 = bitcast i8 %__U to <8 x i1>
   5644   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   5645   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
   5646   ret <2 x double> %2
   5647 }
   5648 
   5649 define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
   5650 ; X86-LABEL: test_mm256_mask_fnmadd_pd:
   5651 ; X86:       # %bb.0: # %entry
   5652 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5653 ; X86-NEXT:    kmovw %eax, %k1
   5654 ; X86-NEXT:    vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
   5655 ; X86-NEXT:    retl
   5656 ;
   5657 ; X64-LABEL: test_mm256_mask_fnmadd_pd:
   5658 ; X64:       # %bb.0: # %entry
   5659 ; X64-NEXT:    kmovw %edi, %k1
   5660 ; X64-NEXT:    vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
   5661 ; X64-NEXT:    retq
   5662 entry:
   5663   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
   5664   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %__C) #9
   5665   %1 = bitcast i8 %__U to <8 x i1>
   5666   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   5667   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
   5668   ret <4 x double> %2
   5669 }
   5670 
   5671 define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
   5672 ; X86-LABEL: test_mm_mask_fnmadd_ps:
   5673 ; X86:       # %bb.0: # %entry
   5674 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5675 ; X86-NEXT:    kmovw %eax, %k1
   5676 ; X86-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
   5677 ; X86-NEXT:    retl
   5678 ;
   5679 ; X64-LABEL: test_mm_mask_fnmadd_ps:
   5680 ; X64:       # %bb.0: # %entry
   5681 ; X64-NEXT:    kmovw %edi, %k1
   5682 ; X64-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
   5683 ; X64-NEXT:    retq
   5684 entry:
   5685   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
   5686   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %__C) #9
   5687   %1 = bitcast i8 %__U to <8 x i1>
   5688   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   5689   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
   5690   ret <4 x float> %2
   5691 }
   5692 
   5693 define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
   5694 ; X86-LABEL: test_mm256_mask_fnmadd_ps:
   5695 ; X86:       # %bb.0: # %entry
   5696 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5697 ; X86-NEXT:    kmovw %eax, %k1
   5698 ; X86-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
   5699 ; X86-NEXT:    retl
   5700 ;
   5701 ; X64-LABEL: test_mm256_mask_fnmadd_ps:
   5702 ; X64:       # %bb.0: # %entry
   5703 ; X64-NEXT:    kmovw %edi, %k1
   5704 ; X64-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
   5705 ; X64-NEXT:    retq
   5706 entry:
   5707   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
   5708   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %__C) #9
   5709   %1 = bitcast i8 %__U to <8 x i1>
   5710   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
   5711   ret <8 x float> %2
   5712 }
   5713 
   5714 define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
   5715 ; X86-LABEL: test_mm_mask_fnmsub_pd:
   5716 ; X86:       # %bb.0: # %entry
   5717 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5718 ; X86-NEXT:    kmovw %eax, %k1
   5719 ; X86-NEXT:    vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
   5720 ; X86-NEXT:    retl
   5721 ;
   5722 ; X64-LABEL: test_mm_mask_fnmsub_pd:
   5723 ; X64:       # %bb.0: # %entry
   5724 ; X64-NEXT:    kmovw %edi, %k1
   5725 ; X64-NEXT:    vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
   5726 ; X64-NEXT:    retq
   5727 entry:
   5728   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
   5729   %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
   5730   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
   5731   %1 = bitcast i8 %__U to <8 x i1>
   5732   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   5733   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
   5734   ret <2 x double> %2
   5735 }
   5736 
   5737 define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
   5738 ; X86-LABEL: test_mm_mask3_fnmsub_pd:
   5739 ; X86:       # %bb.0: # %entry
   5740 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5741 ; X86-NEXT:    kmovw %eax, %k1
   5742 ; X86-NEXT:    vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
   5743 ; X86-NEXT:    vmovapd %xmm2, %xmm0
   5744 ; X86-NEXT:    retl
   5745 ;
   5746 ; X64-LABEL: test_mm_mask3_fnmsub_pd:
   5747 ; X64:       # %bb.0: # %entry
   5748 ; X64-NEXT:    kmovw %edi, %k1
   5749 ; X64-NEXT:    vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
   5750 ; X64-NEXT:    vmovapd %xmm2, %xmm0
   5751 ; X64-NEXT:    retq
   5752 entry:
   5753   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
   5754   %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
   5755   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
   5756   %1 = bitcast i8 %__U to <8 x i1>
   5757   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   5758   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
   5759   ret <2 x double> %2
   5760 }
   5761 
   5762 define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
   5763 ; X86-LABEL: test_mm256_mask_fnmsub_pd:
   5764 ; X86:       # %bb.0: # %entry
   5765 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5766 ; X86-NEXT:    kmovw %eax, %k1
   5767 ; X86-NEXT:    vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
   5768 ; X86-NEXT:    retl
   5769 ;
   5770 ; X64-LABEL: test_mm256_mask_fnmsub_pd:
   5771 ; X64:       # %bb.0: # %entry
   5772 ; X64-NEXT:    kmovw %edi, %k1
   5773 ; X64-NEXT:    vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
   5774 ; X64-NEXT:    retq
   5775 entry:
   5776   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
   5777   %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   5778   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
   5779   %1 = bitcast i8 %__U to <8 x i1>
   5780   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   5781   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
   5782   ret <4 x double> %2
   5783 }
   5784 
   5785 define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
   5786 ; X86-LABEL: test_mm256_mask3_fnmsub_pd:
   5787 ; X86:       # %bb.0: # %entry
   5788 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5789 ; X86-NEXT:    kmovw %eax, %k1
   5790 ; X86-NEXT:    vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
   5791 ; X86-NEXT:    vmovapd %ymm2, %ymm0
   5792 ; X86-NEXT:    retl
   5793 ;
   5794 ; X64-LABEL: test_mm256_mask3_fnmsub_pd:
   5795 ; X64:       # %bb.0: # %entry
   5796 ; X64-NEXT:    kmovw %edi, %k1
   5797 ; X64-NEXT:    vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
   5798 ; X64-NEXT:    vmovapd %ymm2, %ymm0
   5799 ; X64-NEXT:    retq
   5800 entry:
   5801   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
   5802   %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   5803   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
   5804   %1 = bitcast i8 %__U to <8 x i1>
   5805   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   5806   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
   5807   ret <4 x double> %2
   5808 }
   5809 
   5810 define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
   5811 ; X86-LABEL: test_mm_mask_fnmsub_ps:
   5812 ; X86:       # %bb.0: # %entry
   5813 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5814 ; X86-NEXT:    kmovw %eax, %k1
   5815 ; X86-NEXT:    vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
   5816 ; X86-NEXT:    retl
   5817 ;
   5818 ; X64-LABEL: test_mm_mask_fnmsub_ps:
   5819 ; X64:       # %bb.0: # %entry
   5820 ; X64-NEXT:    kmovw %edi, %k1
   5821 ; X64-NEXT:    vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
   5822 ; X64-NEXT:    retq
   5823 entry:
   5824   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
   5825   %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   5826   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
   5827   %1 = bitcast i8 %__U to <8 x i1>
   5828   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   5829   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
   5830   ret <4 x float> %2
   5831 }
   5832 
   5833 define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
   5834 ; X86-LABEL: test_mm_mask3_fnmsub_ps:
   5835 ; X86:       # %bb.0: # %entry
   5836 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5837 ; X86-NEXT:    kmovw %eax, %k1
   5838 ; X86-NEXT:    vfnmsub231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
   5839 ; X86-NEXT:    vmovaps %xmm2, %xmm0
   5840 ; X86-NEXT:    retl
   5841 ;
   5842 ; X64-LABEL: test_mm_mask3_fnmsub_ps:
   5843 ; X64:       # %bb.0: # %entry
   5844 ; X64-NEXT:    kmovw %edi, %k1
   5845 ; X64-NEXT:    vfnmsub231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
   5846 ; X64-NEXT:    vmovaps %xmm2, %xmm0
   5847 ; X64-NEXT:    retq
   5848 entry:
   5849   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
   5850   %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   5851   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
   5852   %1 = bitcast i8 %__U to <8 x i1>
   5853   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   5854   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
   5855   ret <4 x float> %2
   5856 }
   5857 
   5858 define <8 x float> @test_mm256_mask_fnmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
   5859 ; X86-LABEL: test_mm256_mask_fnmsub_ps:
   5860 ; X86:       # %bb.0: # %entry
   5861 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5862 ; X86-NEXT:    kmovw %eax, %k1
   5863 ; X86-NEXT:    vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
   5864 ; X86-NEXT:    retl
   5865 ;
   5866 ; X64-LABEL: test_mm256_mask_fnmsub_ps:
   5867 ; X64:       # %bb.0: # %entry
   5868 ; X64-NEXT:    kmovw %edi, %k1
   5869 ; X64-NEXT:    vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
   5870 ; X64-NEXT:    retq
   5871 entry:
   5872   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
   5873   %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   5874   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
   5875   %1 = bitcast i8 %__U to <8 x i1>
   5876   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
   5877   ret <8 x float> %2
   5878 }
   5879 
   5880 define <8 x float> @test_mm256_mask3_fnmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
   5881 ; X86-LABEL: test_mm256_mask3_fnmsub_ps:
   5882 ; X86:       # %bb.0: # %entry
   5883 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5884 ; X86-NEXT:    kmovw %eax, %k1
   5885 ; X86-NEXT:    vfnmsub231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
   5886 ; X86-NEXT:    vmovaps %ymm2, %ymm0
   5887 ; X86-NEXT:    retl
   5888 ;
   5889 ; X64-LABEL: test_mm256_mask3_fnmsub_ps:
   5890 ; X64:       # %bb.0: # %entry
   5891 ; X64-NEXT:    kmovw %edi, %k1
   5892 ; X64-NEXT:    vfnmsub231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
   5893 ; X64-NEXT:    vmovaps %ymm2, %ymm0
   5894 ; X64-NEXT:    retq
   5895 entry:
   5896   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
   5897   %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   5898   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
   5899   %1 = bitcast i8 %__U to <8 x i1>
   5900   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
   5901   ret <8 x float> %2
   5902 }
   5903 
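; NOTE: The expand-load tests below share one masking idiom: the scalar i8
; mask is bitcast to <8 x i1> and, for fewer than 8 elements, narrowed with a
; shufflevector before being handed to llvm.masked.expandload. A minimal
; sketch of the idiom, with hypothetical names, for a 2 x double payload:
;   %m   = bitcast i8 %U to <8 x i1>
;   %m.2 = shufflevector <8 x i1> %m, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
;   %v   = call <2 x double> @llvm.masked.expandload.v2f64(double* %p, <2 x i1> %m.2, <2 x double> %passthru)
; The mask (merging) forms pass the destination as %passthru; the maskz
; (zeroing) forms pass zeroinitializer, which surfaces as {z} on the
; vexpandpd/vpexpandq instructions.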
   5904 define <2 x double> @test_mm_mask_expandloadu_pd(<2 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
   5905 ; X86-LABEL: test_mm_mask_expandloadu_pd:
   5906 ; X86:       # %bb.0: # %entry
   5907 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   5908 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   5909 ; X86-NEXT:    kmovw %ecx, %k1
   5910 ; X86-NEXT:    vexpandpd (%eax), %xmm0 {%k1}
   5911 ; X86-NEXT:    retl
   5912 ;
   5913 ; X64-LABEL: test_mm_mask_expandloadu_pd:
   5914 ; X64:       # %bb.0: # %entry
   5915 ; X64-NEXT:    kmovw %edi, %k1
   5916 ; X64-NEXT:    vexpandpd (%rsi), %xmm0 {%k1}
   5917 ; X64-NEXT:    retq
   5918 entry:
   5919   %0 = bitcast i8* %__P to double*
   5920   %1 = bitcast i8 %__U to <8 x i1>
   5921   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   5922   %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> %__W)
   5923   ret <2 x double> %2
   5924 }
   5925 
   5926 define <2 x double> @test_mm_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
   5927 ; X86-LABEL: test_mm_maskz_expandloadu_pd:
   5928 ; X86:       # %bb.0: # %entry
   5929 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   5930 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   5931 ; X86-NEXT:    kmovw %ecx, %k1
   5932 ; X86-NEXT:    vexpandpd (%eax), %xmm0 {%k1} {z}
   5933 ; X86-NEXT:    retl
   5934 ;
   5935 ; X64-LABEL: test_mm_maskz_expandloadu_pd:
   5936 ; X64:       # %bb.0: # %entry
   5937 ; X64-NEXT:    kmovw %edi, %k1
   5938 ; X64-NEXT:    vexpandpd (%rsi), %xmm0 {%k1} {z}
   5939 ; X64-NEXT:    retq
   5940 entry:
   5941   %0 = bitcast i8* %__P to double*
   5942   %1 = bitcast i8 %__U to <8 x i1>
   5943   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   5944   %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> zeroinitializer)
   5945   ret <2 x double> %2
   5946 }
   5947 
   5948 define <4 x double> @test_mm256_mask_expandloadu_pd(<4 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
   5949 ; X86-LABEL: test_mm256_mask_expandloadu_pd:
   5950 ; X86:       # %bb.0: # %entry
   5951 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   5952 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   5953 ; X86-NEXT:    kmovw %ecx, %k1
   5954 ; X86-NEXT:    vexpandpd (%eax), %ymm0 {%k1}
   5955 ; X86-NEXT:    retl
   5956 ;
   5957 ; X64-LABEL: test_mm256_mask_expandloadu_pd:
   5958 ; X64:       # %bb.0: # %entry
   5959 ; X64-NEXT:    kmovw %edi, %k1
   5960 ; X64-NEXT:    vexpandpd (%rsi), %ymm0 {%k1}
   5961 ; X64-NEXT:    retq
   5962 entry:
   5963   %0 = bitcast i8* %__P to double*
   5964   %1 = bitcast i8 %__U to <8 x i1>
   5965   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   5966   %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> %__W)
   5967   ret <4 x double> %2
   5968 }
   5969 
   5970 define <4 x double> @test_mm256_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
   5971 ; X86-LABEL: test_mm256_maskz_expandloadu_pd:
   5972 ; X86:       # %bb.0: # %entry
   5973 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   5974 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   5975 ; X86-NEXT:    kmovw %ecx, %k1
   5976 ; X86-NEXT:    vexpandpd (%eax), %ymm0 {%k1} {z}
   5977 ; X86-NEXT:    retl
   5978 ;
   5979 ; X64-LABEL: test_mm256_maskz_expandloadu_pd:
   5980 ; X64:       # %bb.0: # %entry
   5981 ; X64-NEXT:    kmovw %edi, %k1
   5982 ; X64-NEXT:    vexpandpd (%rsi), %ymm0 {%k1} {z}
   5983 ; X64-NEXT:    retq
   5984 entry:
   5985   %0 = bitcast i8* %__P to double*
   5986   %1 = bitcast i8 %__U to <8 x i1>
   5987   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   5988   %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> zeroinitializer)
   5989   ret <4 x double> %2
   5990 }
   5991 
   5992 define <2 x i64> @test_mm_mask_expandloadu_epi64(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
   5993 ; X86-LABEL: test_mm_mask_expandloadu_epi64:
   5994 ; X86:       # %bb.0: # %entry
   5995 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   5996 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   5997 ; X86-NEXT:    kmovw %ecx, %k1
   5998 ; X86-NEXT:    vpexpandq (%eax), %xmm0 {%k1}
   5999 ; X86-NEXT:    retl
   6000 ;
   6001 ; X64-LABEL: test_mm_mask_expandloadu_epi64:
   6002 ; X64:       # %bb.0: # %entry
   6003 ; X64-NEXT:    kmovw %edi, %k1
   6004 ; X64-NEXT:    vpexpandq (%rsi), %xmm0 {%k1}
   6005 ; X64-NEXT:    retq
   6006 entry:
   6007   %0 = bitcast i8* %__P to i64*
   6008   %1 = bitcast i8 %__U to <8 x i1>
   6009   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   6010   %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> %__W) #10
   6011   ret <2 x i64> %2
   6012 }
   6013 
   6014 define <2 x i64> @test_mm_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
   6015 ; X86-LABEL: test_mm_maskz_expandloadu_epi64:
   6016 ; X86:       # %bb.0: # %entry
   6017 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6018 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6019 ; X86-NEXT:    kmovw %ecx, %k1
   6020 ; X86-NEXT:    vpexpandq (%eax), %xmm0 {%k1} {z}
   6021 ; X86-NEXT:    retl
   6022 ;
   6023 ; X64-LABEL: test_mm_maskz_expandloadu_epi64:
   6024 ; X64:       # %bb.0: # %entry
   6025 ; X64-NEXT:    kmovw %edi, %k1
   6026 ; X64-NEXT:    vpexpandq (%rsi), %xmm0 {%k1} {z}
   6027 ; X64-NEXT:    retq
   6028 entry:
   6029   %0 = bitcast i8* %__P to i64*
   6030   %1 = bitcast i8 %__U to <8 x i1>
   6031   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   6032   %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> zeroinitializer)
   6033   ret <2 x i64> %2
   6034 }
   6035 
   6036 define <4 x i64> @test_mm256_mask_expandloadu_epi64(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
   6037 ; X86-LABEL: test_mm256_mask_expandloadu_epi64:
   6038 ; X86:       # %bb.0: # %entry
   6039 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6040 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6041 ; X86-NEXT:    kmovw %ecx, %k1
   6042 ; X86-NEXT:    vpexpandq (%eax), %ymm0 {%k1}
   6043 ; X86-NEXT:    retl
   6044 ;
   6045 ; X64-LABEL: test_mm256_mask_expandloadu_epi64:
   6046 ; X64:       # %bb.0: # %entry
   6047 ; X64-NEXT:    kmovw %edi, %k1
   6048 ; X64-NEXT:    vpexpandq (%rsi), %ymm0 {%k1}
   6049 ; X64-NEXT:    retq
   6050 entry:
   6051   %0 = bitcast i8* %__P to i64*
   6052   %1 = bitcast i8 %__U to <8 x i1>
   6053   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6054   %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> %__W) #10
   6055   ret <4 x i64> %2
   6056 }
   6057 
   6058 define <4 x i64> @test_mm256_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
   6059 ; X86-LABEL: test_mm256_maskz_expandloadu_epi64:
   6060 ; X86:       # %bb.0: # %entry
   6061 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6062 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6063 ; X86-NEXT:    kmovw %ecx, %k1
   6064 ; X86-NEXT:    vpexpandq (%eax), %ymm0 {%k1} {z}
   6065 ; X86-NEXT:    retl
   6066 ;
   6067 ; X64-LABEL: test_mm256_maskz_expandloadu_epi64:
   6068 ; X64:       # %bb.0: # %entry
   6069 ; X64-NEXT:    kmovw %edi, %k1
   6070 ; X64-NEXT:    vpexpandq (%rsi), %ymm0 {%k1} {z}
   6071 ; X64-NEXT:    retq
   6072 entry:
   6073   %0 = bitcast i8* %__P to i64*
   6074   %1 = bitcast i8 %__U to <8 x i1>
   6075   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6076   %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> zeroinitializer)
   6077   ret <4 x i64> %2
   6078 }
   6079 
   6080 define <4 x float> @test_mm_mask_expandloadu_ps(<4 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
   6081 ; X86-LABEL: test_mm_mask_expandloadu_ps:
   6082 ; X86:       # %bb.0: # %entry
   6083 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6084 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6085 ; X86-NEXT:    kmovw %ecx, %k1
   6086 ; X86-NEXT:    vexpandps (%eax), %xmm0 {%k1}
   6087 ; X86-NEXT:    retl
   6088 ;
   6089 ; X64-LABEL: test_mm_mask_expandloadu_ps:
   6090 ; X64:       # %bb.0: # %entry
   6091 ; X64-NEXT:    kmovw %edi, %k1
   6092 ; X64-NEXT:    vexpandps (%rsi), %xmm0 {%k1}
   6093 ; X64-NEXT:    retq
   6094 entry:
   6095   %0 = bitcast i8* %__P to float*
   6096   %1 = bitcast i8 %__U to <8 x i1>
   6097   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6098   %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> %__W)
   6099   ret <4 x float> %2
   6100 }
   6101 
   6102 define <4 x float> @test_mm_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
   6103 ; X86-LABEL: test_mm_maskz_expandloadu_ps:
   6104 ; X86:       # %bb.0: # %entry
   6105 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6106 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6107 ; X86-NEXT:    kmovw %ecx, %k1
   6108 ; X86-NEXT:    vexpandps (%eax), %xmm0 {%k1} {z}
   6109 ; X86-NEXT:    retl
   6110 ;
   6111 ; X64-LABEL: test_mm_maskz_expandloadu_ps:
   6112 ; X64:       # %bb.0: # %entry
   6113 ; X64-NEXT:    kmovw %edi, %k1
   6114 ; X64-NEXT:    vexpandps (%rsi), %xmm0 {%k1} {z}
   6115 ; X64-NEXT:    retq
   6116 entry:
   6117   %0 = bitcast i8* %__P to float*
   6118   %1 = bitcast i8 %__U to <8 x i1>
   6119   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6120   %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> zeroinitializer)
   6121   ret <4 x float> %2
   6122 }
   6123 
   6124 define <8 x float> @test_mm256_mask_expandloadu_ps(<8 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
   6125 ; X86-LABEL: test_mm256_mask_expandloadu_ps:
   6126 ; X86:       # %bb.0: # %entry
   6127 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6128 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6129 ; X86-NEXT:    kmovw %ecx, %k1
   6130 ; X86-NEXT:    vexpandps (%eax), %ymm0 {%k1}
   6131 ; X86-NEXT:    retl
   6132 ;
   6133 ; X64-LABEL: test_mm256_mask_expandloadu_ps:
   6134 ; X64:       # %bb.0: # %entry
   6135 ; X64-NEXT:    kmovw %edi, %k1
   6136 ; X64-NEXT:    vexpandps (%rsi), %ymm0 {%k1}
   6137 ; X64-NEXT:    retq
   6138 entry:
   6139   %0 = bitcast i8* %__P to float*
   6140   %1 = bitcast i8 %__U to <8 x i1>
   6141   %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> %__W)
   6142   ret <8 x float> %2
   6143 }
   6144 
   6145 define <8 x float> @test_mm256_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
   6146 ; X86-LABEL: test_mm256_maskz_expandloadu_ps:
   6147 ; X86:       # %bb.0: # %entry
   6148 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6149 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6150 ; X86-NEXT:    kmovw %ecx, %k1
   6151 ; X86-NEXT:    vexpandps (%eax), %ymm0 {%k1} {z}
   6152 ; X86-NEXT:    retl
   6153 ;
   6154 ; X64-LABEL: test_mm256_maskz_expandloadu_ps:
   6155 ; X64:       # %bb.0: # %entry
   6156 ; X64-NEXT:    kmovw %edi, %k1
   6157 ; X64-NEXT:    vexpandps (%rsi), %ymm0 {%k1} {z}
   6158 ; X64-NEXT:    retq
   6159 entry:
   6160   %0 = bitcast i8* %__P to float*
   6161   %1 = bitcast i8 %__U to <8 x i1>
   6162   %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> zeroinitializer)
   6163   ret <8 x float> %2
   6164 }
   6165 
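; NOTE: The epi32 expand-load tests differ only in that the <2 x i64> or
; <4 x i64> payload is bitcast to an i32 vector around the intrinsic call,
; e.g. with hypothetical names:
;   %w32 = bitcast <2 x i64> %W to <4 x i32>
;   %v   = call <4 x i32> @llvm.masked.expandload.v4i32(i32* %p, <4 x i1> %m.4, <4 x i32> %w32)
;   %r   = bitcast <4 x i32> %v to <2 x i64>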
   6166 define <2 x i64> @test_mm_mask_expandloadu_epi32(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
   6167 ; X86-LABEL: test_mm_mask_expandloadu_epi32:
   6168 ; X86:       # %bb.0: # %entry
   6169 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6170 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6171 ; X86-NEXT:    kmovw %ecx, %k1
   6172 ; X86-NEXT:    vpexpandd (%eax), %xmm0 {%k1}
   6173 ; X86-NEXT:    retl
   6174 ;
   6175 ; X64-LABEL: test_mm_mask_expandloadu_epi32:
   6176 ; X64:       # %bb.0: # %entry
   6177 ; X64-NEXT:    kmovw %edi, %k1
   6178 ; X64-NEXT:    vpexpandd (%rsi), %xmm0 {%k1}
   6179 ; X64-NEXT:    retq
   6180 entry:
   6181   %0 = bitcast <2 x i64> %__W to <4 x i32>
   6182   %1 = bitcast i8* %__P to i32*
   6183   %2 = bitcast i8 %__U to <8 x i1>
   6184   %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6185   %3 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %1, <4 x i1> %extract.i, <4 x i32> %0)
   6186   %4 = bitcast <4 x i32> %3 to <2 x i64>
   6187   ret <2 x i64> %4
   6188 }
   6189 
   6190 define <2 x i64> @test_mm_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
   6191 ; X86-LABEL: test_mm_maskz_expandloadu_epi32:
   6192 ; X86:       # %bb.0: # %entry
   6193 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6194 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6195 ; X86-NEXT:    kmovw %ecx, %k1
   6196 ; X86-NEXT:    vpexpandd (%eax), %xmm0 {%k1} {z}
   6197 ; X86-NEXT:    retl
   6198 ;
   6199 ; X64-LABEL: test_mm_maskz_expandloadu_epi32:
   6200 ; X64:       # %bb.0: # %entry
   6201 ; X64-NEXT:    kmovw %edi, %k1
   6202 ; X64-NEXT:    vpexpandd (%rsi), %xmm0 {%k1} {z}
   6203 ; X64-NEXT:    retq
   6204 entry:
   6205   %0 = bitcast i8* %__P to i32*
   6206   %1 = bitcast i8 %__U to <8 x i1>
   6207   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6208   %2 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %0, <4 x i1> %extract.i, <4 x i32> zeroinitializer)
   6209   %3 = bitcast <4 x i32> %2 to <2 x i64>
   6210   ret <2 x i64> %3
   6211 }
   6212 
   6213 define <4 x i64> @test_mm256_mask_expandloadu_epi32(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
   6214 ; X86-LABEL: test_mm256_mask_expandloadu_epi32:
   6215 ; X86:       # %bb.0: # %entry
   6216 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6217 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6218 ; X86-NEXT:    kmovw %ecx, %k1
   6219 ; X86-NEXT:    vpexpandd (%eax), %ymm0 {%k1}
   6220 ; X86-NEXT:    retl
   6221 ;
   6222 ; X64-LABEL: test_mm256_mask_expandloadu_epi32:
   6223 ; X64:       # %bb.0: # %entry
   6224 ; X64-NEXT:    kmovw %edi, %k1
   6225 ; X64-NEXT:    vpexpandd (%rsi), %ymm0 {%k1}
   6226 ; X64-NEXT:    retq
   6227 entry:
   6228   %0 = bitcast <4 x i64> %__W to <8 x i32>
   6229   %1 = bitcast i8* %__P to i32*
   6230   %2 = bitcast i8 %__U to <8 x i1>
   6231   %3 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %1, <8 x i1> %2, <8 x i32> %0)
   6232   %4 = bitcast <8 x i32> %3 to <4 x i64>
   6233   ret <4 x i64> %4
   6234 }
   6235 
   6236 define <4 x i64> @test_mm256_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
   6237 ; X86-LABEL: test_mm256_maskz_expandloadu_epi32:
   6238 ; X86:       # %bb.0: # %entry
   6239 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6240 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6241 ; X86-NEXT:    kmovw %ecx, %k1
   6242 ; X86-NEXT:    vpexpandd (%eax), %ymm0 {%k1} {z}
   6243 ; X86-NEXT:    retl
   6244 ;
   6245 ; X64-LABEL: test_mm256_maskz_expandloadu_epi32:
   6246 ; X64:       # %bb.0: # %entry
   6247 ; X64-NEXT:    kmovw %edi, %k1
   6248 ; X64-NEXT:    vpexpandd (%rsi), %ymm0 {%k1} {z}
   6249 ; X64-NEXT:    retq
   6250 entry:
   6251   %0 = bitcast i8* %__P to i32*
   6252   %1 = bitcast i8 %__U to <8 x i1>
   6253   %2 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %0, <8 x i1> %1, <8 x i32> zeroinitializer)
   6254   %3 = bitcast <8 x i32> %2 to <4 x i64>
   6255   ret <4 x i64> %3
   6256 }
   6257 
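; NOTE: The compress-store tests are the store-side dual of the expand loads:
; the same bitcast-plus-shufflevector mask narrowing feeds
; llvm.masked.compressstore, which takes the data and pointer but no
; pass-through operand. A minimal sketch with hypothetical names:
;   tail call void @llvm.masked.compressstore.v2f64(<2 x double> %v, double* %p, <2 x i1> %m.2)
; As the checks show, the 256-bit cases also emit vzeroupper before returning.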
   6258 define void @test_mm_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <2 x double> %__A) {
   6259 ; X86-LABEL: test_mm_mask_compressstoreu_pd:
   6260 ; X86:       # %bb.0: # %entry
   6261 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6262 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   6263 ; X86-NEXT:    kmovw %eax, %k1
   6264 ; X86-NEXT:    vcompresspd %xmm0, (%ecx) {%k1}
   6265 ; X86-NEXT:    retl
   6266 ;
   6267 ; X64-LABEL: test_mm_mask_compressstoreu_pd:
   6268 ; X64:       # %bb.0: # %entry
   6269 ; X64-NEXT:    kmovw %esi, %k1
   6270 ; X64-NEXT:    vcompresspd %xmm0, (%rdi) {%k1}
   6271 ; X64-NEXT:    retq
   6272 entry:
   6273   %0 = bitcast i8* %__P to double*
   6274   %1 = bitcast i8 %__U to <8 x i1>
   6275   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   6276   tail call void @llvm.masked.compressstore.v2f64(<2 x double> %__A, double* %0, <2 x i1> %extract.i)
   6277   ret void
   6278 }
   6279 
   6280 define void @test_mm256_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <4 x double> %__A) {
   6281 ; X86-LABEL: test_mm256_mask_compressstoreu_pd:
   6282 ; X86:       # %bb.0: # %entry
   6283 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6284 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   6285 ; X86-NEXT:    kmovw %eax, %k1
   6286 ; X86-NEXT:    vcompresspd %ymm0, (%ecx) {%k1}
   6287 ; X86-NEXT:    vzeroupper
   6288 ; X86-NEXT:    retl
   6289 ;
   6290 ; X64-LABEL: test_mm256_mask_compressstoreu_pd:
   6291 ; X64:       # %bb.0: # %entry
   6292 ; X64-NEXT:    kmovw %esi, %k1
   6293 ; X64-NEXT:    vcompresspd %ymm0, (%rdi) {%k1}
   6294 ; X64-NEXT:    vzeroupper
   6295 ; X64-NEXT:    retq
   6296 entry:
   6297   %0 = bitcast i8* %__P to double*
   6298   %1 = bitcast i8 %__U to <8 x i1>
   6299   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6300   tail call void @llvm.masked.compressstore.v4f64(<4 x double> %__A, double* %0, <4 x i1> %extract.i)
   6301   ret void
   6302 }
   6303 
   6304 define void @test_mm_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
   6305 ; X86-LABEL: test_mm_mask_compressstoreu_epi64:
   6306 ; X86:       # %bb.0: # %entry
   6307 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6308 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   6309 ; X86-NEXT:    kmovw %eax, %k1
   6310 ; X86-NEXT:    vpcompressq %xmm0, (%ecx) {%k1}
   6311 ; X86-NEXT:    retl
   6312 ;
   6313 ; X64-LABEL: test_mm_mask_compressstoreu_epi64:
   6314 ; X64:       # %bb.0: # %entry
   6315 ; X64-NEXT:    kmovw %esi, %k1
   6316 ; X64-NEXT:    vpcompressq %xmm0, (%rdi) {%k1}
   6317 ; X64-NEXT:    retq
   6318 entry:
   6319   %0 = bitcast i8* %__P to i64*
   6320   %1 = bitcast i8 %__U to <8 x i1>
   6321   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   6322   tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %__A, i64* %0, <2 x i1> %extract.i)
   6323   ret void
   6324 }
   6325 
   6326 define void @test_mm256_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
   6327 ; X86-LABEL: test_mm256_mask_compressstoreu_epi64:
   6328 ; X86:       # %bb.0: # %entry
   6329 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6330 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   6331 ; X86-NEXT:    kmovw %eax, %k1
   6332 ; X86-NEXT:    vpcompressq %ymm0, (%ecx) {%k1}
   6333 ; X86-NEXT:    vzeroupper
   6334 ; X86-NEXT:    retl
   6335 ;
   6336 ; X64-LABEL: test_mm256_mask_compressstoreu_epi64:
   6337 ; X64:       # %bb.0: # %entry
   6338 ; X64-NEXT:    kmovw %esi, %k1
   6339 ; X64-NEXT:    vpcompressq %ymm0, (%rdi) {%k1}
   6340 ; X64-NEXT:    vzeroupper
   6341 ; X64-NEXT:    retq
   6342 entry:
   6343   %0 = bitcast i8* %__P to i64*
   6344   %1 = bitcast i8 %__U to <8 x i1>
   6345   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6346   tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %__A, i64* %0, <4 x i1> %extract.i)
   6347   ret void
   6348 }
   6349 
   6350 define void @test_mm_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <4 x float> %__A) {
   6351 ; X86-LABEL: test_mm_mask_compressstoreu_ps:
   6352 ; X86:       # %bb.0: # %entry
   6353 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6354 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   6355 ; X86-NEXT:    kmovw %eax, %k1
   6356 ; X86-NEXT:    vcompressps %xmm0, (%ecx) {%k1}
   6357 ; X86-NEXT:    retl
   6358 ;
   6359 ; X64-LABEL: test_mm_mask_compressstoreu_ps:
   6360 ; X64:       # %bb.0: # %entry
   6361 ; X64-NEXT:    kmovw %esi, %k1
   6362 ; X64-NEXT:    vcompressps %xmm0, (%rdi) {%k1}
   6363 ; X64-NEXT:    retq
   6364 entry:
   6365   %0 = bitcast i8* %__P to float*
   6366   %1 = bitcast i8 %__U to <8 x i1>
   6367   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6368   tail call void @llvm.masked.compressstore.v4f32(<4 x float> %__A, float* %0, <4 x i1> %extract.i)
   6369   ret void
   6370 }
   6371 
   6372 define void @test_mm256_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <8 x float> %__A) {
   6373 ; X86-LABEL: test_mm256_mask_compressstoreu_ps:
   6374 ; X86:       # %bb.0: # %entry
   6375 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6376 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   6377 ; X86-NEXT:    kmovw %eax, %k1
   6378 ; X86-NEXT:    vcompressps %ymm0, (%ecx) {%k1}
   6379 ; X86-NEXT:    vzeroupper
   6380 ; X86-NEXT:    retl
   6381 ;
   6382 ; X64-LABEL: test_mm256_mask_compressstoreu_ps:
   6383 ; X64:       # %bb.0: # %entry
   6384 ; X64-NEXT:    kmovw %esi, %k1
   6385 ; X64-NEXT:    vcompressps %ymm0, (%rdi) {%k1}
   6386 ; X64-NEXT:    vzeroupper
   6387 ; X64-NEXT:    retq
   6388 entry:
   6389   %0 = bitcast i8* %__P to float*
   6390   %1 = bitcast i8 %__U to <8 x i1>
   6391   tail call void @llvm.masked.compressstore.v8f32(<8 x float> %__A, float* %0, <8 x i1> %1)
   6392   ret void
   6393 }
   6394 
   6395 define void @test_mm_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
   6396 ; X86-LABEL: test_mm_mask_compressstoreu_epi32:
   6397 ; X86:       # %bb.0: # %entry
   6398 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6399 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   6400 ; X86-NEXT:    kmovw %eax, %k1
   6401 ; X86-NEXT:    vpcompressd %xmm0, (%ecx) {%k1}
   6402 ; X86-NEXT:    retl
   6403 ;
   6404 ; X64-LABEL: test_mm_mask_compressstoreu_epi32:
   6405 ; X64:       # %bb.0: # %entry
   6406 ; X64-NEXT:    kmovw %esi, %k1
   6407 ; X64-NEXT:    vpcompressd %xmm0, (%rdi) {%k1}
   6408 ; X64-NEXT:    retq
   6409 entry:
   6410   %0 = bitcast <2 x i64> %__A to <4 x i32>
   6411   %1 = bitcast i8* %__P to i32*
   6412   %2 = bitcast i8 %__U to <8 x i1>
   6413   %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6414   tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %0, i32* %1, <4 x i1> %extract.i)
   6415   ret void
   6416 }
   6417 
   6418 define void @test_mm256_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
   6419 ; X86-LABEL: test_mm256_mask_compressstoreu_epi32:
   6420 ; X86:       # %bb.0: # %entry
   6421 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6422 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   6423 ; X86-NEXT:    kmovw %eax, %k1
   6424 ; X86-NEXT:    vpcompressd %ymm0, (%ecx) {%k1}
   6425 ; X86-NEXT:    vzeroupper
   6426 ; X86-NEXT:    retl
   6427 ;
   6428 ; X64-LABEL: test_mm256_mask_compressstoreu_epi32:
   6429 ; X64:       # %bb.0: # %entry
   6430 ; X64-NEXT:    kmovw %esi, %k1
   6431 ; X64-NEXT:    vpcompressd %ymm0, (%rdi) {%k1}
   6432 ; X64-NEXT:    vzeroupper
   6433 ; X64-NEXT:    retq
   6434 entry:
   6435   %0 = bitcast <4 x i64> %__A to <8 x i32>
   6436   %1 = bitcast i8* %__P to i32*
   6437   %2 = bitcast i8 %__U to <8 x i1>
   6438   tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %0, i32* %1, <8 x i1> %2) #10
   6439   ret void
   6440 }
   6441 
   6442 
   6443 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8
   6444 declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8
   6445 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8
   6446 declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8
   6447 
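; NOTE: The sqrt tests pair the generic llvm.sqrt.* intrinsics with the usual
; mask select, exercising the merge-masked and zero-masked forms of vsqrtpd
; and vsqrtps. A minimal sketch with hypothetical names:
;   %s = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
;   %r = select <2 x i1> %m.2, <2 x double> %s, <2 x double> %w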
   6448 define <2 x double> @test_mm_mask_sqrt_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
   6449 ; X86-LABEL: test_mm_mask_sqrt_pd:
   6450 ; X86:       # %bb.0: # %entry
   6451 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6452 ; X86-NEXT:    kmovw %eax, %k1
   6453 ; X86-NEXT:    vsqrtpd %xmm1, %xmm0 {%k1}
   6454 ; X86-NEXT:    retl
   6455 ;
   6456 ; X64-LABEL: test_mm_mask_sqrt_pd:
   6457 ; X64:       # %bb.0: # %entry
   6458 ; X64-NEXT:    kmovw %edi, %k1
   6459 ; X64-NEXT:    vsqrtpd %xmm1, %xmm0 {%k1}
   6460 ; X64-NEXT:    retq
   6461 entry:
   6462   %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
   6463   %1 = bitcast i8 %__U to <8 x i1>
   6464   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   6465   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__W
   6466   ret <2 x double> %2
   6467 }
   6468 
   6469 declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
   6470 
   6471 define <2 x double> @test_mm_maskz_sqrt_pd(i8 zeroext %__U, <2 x double> %__A) {
   6472 ; X86-LABEL: test_mm_maskz_sqrt_pd:
   6473 ; X86:       # %bb.0: # %entry
   6474 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6475 ; X86-NEXT:    kmovw %eax, %k1
   6476 ; X86-NEXT:    vsqrtpd %xmm0, %xmm0 {%k1} {z}
   6477 ; X86-NEXT:    retl
   6478 ;
   6479 ; X64-LABEL: test_mm_maskz_sqrt_pd:
   6480 ; X64:       # %bb.0: # %entry
   6481 ; X64-NEXT:    kmovw %edi, %k1
   6482 ; X64-NEXT:    vsqrtpd %xmm0, %xmm0 {%k1} {z}
   6483 ; X64-NEXT:    retq
   6484 entry:
   6485   %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
   6486   %1 = bitcast i8 %__U to <8 x i1>
   6487   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   6488   %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
   6489   ret <2 x double> %2
   6490 }
   6491 
   6492 define <4 x double> @test_mm256_mask_sqrt_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
   6493 ; X86-LABEL: test_mm256_mask_sqrt_pd:
   6494 ; X86:       # %bb.0: # %entry
   6495 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6496 ; X86-NEXT:    kmovw %eax, %k1
   6497 ; X86-NEXT:    vsqrtpd %ymm1, %ymm0 {%k1}
   6498 ; X86-NEXT:    retl
   6499 ;
   6500 ; X64-LABEL: test_mm256_mask_sqrt_pd:
   6501 ; X64:       # %bb.0: # %entry
   6502 ; X64-NEXT:    kmovw %edi, %k1
   6503 ; X64-NEXT:    vsqrtpd %ymm1, %ymm0 {%k1}
   6504 ; X64-NEXT:    retq
   6505 entry:
   6506   %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
   6507   %1 = bitcast i8 %__U to <8 x i1>
   6508   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6509   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__W
   6510   ret <4 x double> %2
   6511 }
   6512 
   6513 declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
   6514 
   6515 define <4 x double> @test_mm256_maskz_sqrt_pd(i8 zeroext %__U, <4 x double> %__A) {
   6516 ; X86-LABEL: test_mm256_maskz_sqrt_pd:
   6517 ; X86:       # %bb.0: # %entry
   6518 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6519 ; X86-NEXT:    kmovw %eax, %k1
   6520 ; X86-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z}
   6521 ; X86-NEXT:    retl
   6522 ;
   6523 ; X64-LABEL: test_mm256_maskz_sqrt_pd:
   6524 ; X64:       # %bb.0: # %entry
   6525 ; X64-NEXT:    kmovw %edi, %k1
   6526 ; X64-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z}
   6527 ; X64-NEXT:    retq
   6528 entry:
   6529   %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
   6530   %1 = bitcast i8 %__U to <8 x i1>
   6531   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6532   %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
   6533   ret <4 x double> %2
   6534 }
   6535 
   6536 define <4 x float> @test_mm_mask_sqrt_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
   6537 ; X86-LABEL: test_mm_mask_sqrt_ps:
   6538 ; X86:       # %bb.0: # %entry
   6539 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6540 ; X86-NEXT:    kmovw %eax, %k1
   6541 ; X86-NEXT:    vsqrtps %xmm1, %xmm0 {%k1}
   6542 ; X86-NEXT:    retl
   6543 ;
   6544 ; X64-LABEL: test_mm_mask_sqrt_ps:
   6545 ; X64:       # %bb.0: # %entry
   6546 ; X64-NEXT:    kmovw %edi, %k1
   6547 ; X64-NEXT:    vsqrtps %xmm1, %xmm0 {%k1}
   6548 ; X64-NEXT:    retq
   6549 entry:
   6550   %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
   6551   %1 = bitcast i8 %__U to <8 x i1>
   6552   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6553   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
   6554   ret <4 x float> %2
   6555 }
   6556 
   6557 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
   6558 
   6559 define <4 x float> @test_mm_maskz_sqrt_ps(i8 zeroext %__U, <4 x float> %__A) {
   6560 ; X86-LABEL: test_mm_maskz_sqrt_ps:
   6561 ; X86:       # %bb.0: # %entry
   6562 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6563 ; X86-NEXT:    kmovw %eax, %k1
   6564 ; X86-NEXT:    vsqrtps %xmm0, %xmm0 {%k1} {z}
   6565 ; X86-NEXT:    retl
   6566 ;
   6567 ; X64-LABEL: test_mm_maskz_sqrt_ps:
   6568 ; X64:       # %bb.0: # %entry
   6569 ; X64-NEXT:    kmovw %edi, %k1
   6570 ; X64-NEXT:    vsqrtps %xmm0, %xmm0 {%k1} {z}
   6571 ; X64-NEXT:    retq
   6572 entry:
   6573   %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
   6574   %1 = bitcast i8 %__U to <8 x i1>
   6575   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6576   %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
   6577   ret <4 x float> %2
   6578 }
   6579 
   6580 define <8 x float> @test_mm256_mask_sqrt_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A) {
   6581 ; X86-LABEL: test_mm256_mask_sqrt_ps:
   6582 ; X86:       # %bb.0: # %entry
   6583 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6584 ; X86-NEXT:    kmovw %eax, %k1
   6585 ; X86-NEXT:    vsqrtps %ymm1, %ymm0 {%k1}
   6586 ; X86-NEXT:    retl
   6587 ;
   6588 ; X64-LABEL: test_mm256_mask_sqrt_ps:
   6589 ; X64:       # %bb.0: # %entry
   6590 ; X64-NEXT:    kmovw %edi, %k1
   6591 ; X64-NEXT:    vsqrtps %ymm1, %ymm0 {%k1}
   6592 ; X64-NEXT:    retq
   6593 entry:
   6594   %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
   6595   %1 = bitcast i8 %__U to <8 x i1>
   6596   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__W
   6597   ret <8 x float> %2
   6598 }
   6599 
   6600 define <8 x float> @test_mm256_maskz_sqrt_ps(i8 zeroext %__U, <8 x float> %__A) {
   6601 ; X86-LABEL: test_mm256_maskz_sqrt_ps:
   6602 ; X86:       # %bb.0: # %entry
   6603 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6604 ; X86-NEXT:    kmovw %eax, %k1
   6605 ; X86-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z}
   6606 ; X86-NEXT:    retl
   6607 ;
   6608 ; X64-LABEL: test_mm256_maskz_sqrt_ps:
   6609 ; X64:       # %bb.0: # %entry
   6610 ; X64-NEXT:    kmovw %edi, %k1
   6611 ; X64-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z}
   6612 ; X64-NEXT:    retq
   6613 entry:
   6614   %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
   6615   %1 = bitcast i8 %__U to <8 x i1>
   6616   %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
   6617   ret <8 x float> %2
   6618 }
   6619 
   6620 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
   6621 
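; NOTE: The rotate tests below use the llvm.x86.avx512.prol.* intrinsics with
; an immediate count of 5, lowering to vprold/vprolq. The 32-bit element
; forms bitcast the payload through <4 x i32> or <8 x i32> first, e.g. with
; hypothetical names:
;   %a32 = bitcast <2 x i64> %A to <4 x i32>
;   %r   = call <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32> %a32, i32 5)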
   6622 define <2 x i64> @test_mm_rol_epi32(<2 x i64> %__A) {
   6623 ; CHECK-LABEL: test_mm_rol_epi32:
   6624 ; CHECK:       # %bb.0: # %entry
   6625 ; CHECK-NEXT:    vprold $5, %xmm0, %xmm0
   6626 ; CHECK-NEXT:    ret{{[l|q]}}
   6627 entry:
   6628   %0 = bitcast <2 x i64> %__A to <4 x i32>
   6629   %1 = tail call <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32> %0, i32 5)
   6630   %2 = bitcast <4 x i32> %1 to <2 x i64>
   6631   ret <2 x i64> %2
   6632 }
   6633 
   6634 declare <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32>, i32)
   6635 
   6636 define <2 x i64> @test_mm_mask_rol_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
   6637 ; X86-LABEL: test_mm_mask_rol_epi32:
   6638 ; X86:       # %bb.0: # %entry
   6639 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6640 ; X86-NEXT:    kmovw %eax, %k1
   6641 ; X86-NEXT:    vprold $5, %xmm1, %xmm0 {%k1}
   6642 ; X86-NEXT:    retl
   6643 ;
   6644 ; X64-LABEL: test_mm_mask_rol_epi32:
   6645 ; X64:       # %bb.0: # %entry
   6646 ; X64-NEXT:    kmovw %edi, %k1
   6647 ; X64-NEXT:    vprold $5, %xmm1, %xmm0 {%k1}
   6648 ; X64-NEXT:    retq
   6649 entry:
   6650   %0 = bitcast <2 x i64> %__A to <4 x i32>
   6651   %1 = tail call <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32> %0, i32 5)
   6652   %2 = bitcast <2 x i64> %__W to <4 x i32>
   6653   %3 = bitcast i8 %__U to <8 x i1>
   6654   %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6655   %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
   6656   %5 = bitcast <4 x i32> %4 to <2 x i64>
   6657   ret <2 x i64> %5
   6658 }
   6659 
   6660 define <2 x i64> @test_mm_maskz_rol_epi32(i8 zeroext %__U, <2 x i64> %__A) {
   6661 ; X86-LABEL: test_mm_maskz_rol_epi32:
   6662 ; X86:       # %bb.0: # %entry
   6663 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6664 ; X86-NEXT:    kmovw %eax, %k1
   6665 ; X86-NEXT:    vprold $5, %xmm0, %xmm0 {%k1} {z}
   6666 ; X86-NEXT:    retl
   6667 ;
   6668 ; X64-LABEL: test_mm_maskz_rol_epi32:
   6669 ; X64:       # %bb.0: # %entry
   6670 ; X64-NEXT:    kmovw %edi, %k1
   6671 ; X64-NEXT:    vprold $5, %xmm0, %xmm0 {%k1} {z}
   6672 ; X64-NEXT:    retq
   6673 entry:
   6674   %0 = bitcast <2 x i64> %__A to <4 x i32>
   6675   %1 = tail call <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32> %0, i32 5)
   6676   %2 = bitcast i8 %__U to <8 x i1>
   6677   %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6678   %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
   6679   %4 = bitcast <4 x i32> %3 to <2 x i64>
   6680   ret <2 x i64> %4
   6681 }
   6682 
   6683 define <4 x i64> @test_mm256_rol_epi32(<4 x i64> %__A) {
   6684 ; CHECK-LABEL: test_mm256_rol_epi32:
   6685 ; CHECK:       # %bb.0: # %entry
   6686 ; CHECK-NEXT:    vprold $5, %ymm0, %ymm0
   6687 ; CHECK-NEXT:    ret{{[l|q]}}
   6688 entry:
   6689   %0 = bitcast <4 x i64> %__A to <8 x i32>
   6690   %1 = tail call <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32> %0, i32 5)
   6691   %2 = bitcast <8 x i32> %1 to <4 x i64>
   6692   ret <4 x i64> %2
   6693 }
   6694 
   6695 declare <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32>, i32)
   6696 
   6697 define <4 x i64> @test_mm256_mask_rol_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
   6698 ; X86-LABEL: test_mm256_mask_rol_epi32:
   6699 ; X86:       # %bb.0: # %entry
   6700 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6701 ; X86-NEXT:    kmovw %eax, %k1
   6702 ; X86-NEXT:    vprold $5, %ymm1, %ymm0 {%k1}
   6703 ; X86-NEXT:    retl
   6704 ;
   6705 ; X64-LABEL: test_mm256_mask_rol_epi32:
   6706 ; X64:       # %bb.0: # %entry
   6707 ; X64-NEXT:    kmovw %edi, %k1
   6708 ; X64-NEXT:    vprold $5, %ymm1, %ymm0 {%k1}
   6709 ; X64-NEXT:    retq
   6710 entry:
   6711   %0 = bitcast <4 x i64> %__A to <8 x i32>
   6712   %1 = tail call <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32> %0, i32 5)
   6713   %2 = bitcast <4 x i64> %__W to <8 x i32>
   6714   %3 = bitcast i8 %__U to <8 x i1>
   6715   %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
   6716   %5 = bitcast <8 x i32> %4 to <4 x i64>
   6717   ret <4 x i64> %5
   6718 }
   6719 
   6720 define <4 x i64> @test_mm256_maskz_rol_epi32(i8 zeroext %__U, <4 x i64> %__A) {
   6721 ; X86-LABEL: test_mm256_maskz_rol_epi32:
   6722 ; X86:       # %bb.0: # %entry
   6723 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6724 ; X86-NEXT:    kmovw %eax, %k1
   6725 ; X86-NEXT:    vprold $5, %ymm0, %ymm0 {%k1} {z}
   6726 ; X86-NEXT:    retl
   6727 ;
   6728 ; X64-LABEL: test_mm256_maskz_rol_epi32:
   6729 ; X64:       # %bb.0: # %entry
   6730 ; X64-NEXT:    kmovw %edi, %k1
   6731 ; X64-NEXT:    vprold $5, %ymm0, %ymm0 {%k1} {z}
   6732 ; X64-NEXT:    retq
   6733 entry:
   6734   %0 = bitcast <4 x i64> %__A to <8 x i32>
   6735   %1 = tail call <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32> %0, i32 5)
   6736   %2 = bitcast i8 %__U to <8 x i1>
   6737   %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
   6738   %4 = bitcast <8 x i32> %3 to <4 x i64>
   6739   ret <4 x i64> %4
   6740 }
   6741 
   6742 define <2 x i64> @test_mm_rol_epi64(<2 x i64> %__A) {
   6743 ; CHECK-LABEL: test_mm_rol_epi64:
   6744 ; CHECK:       # %bb.0: # %entry
   6745 ; CHECK-NEXT:    vprolq $5, %xmm0, %xmm0
   6746 ; CHECK-NEXT:    ret{{[l|q]}}
   6747 entry:
   6748   %0 = tail call <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64> %__A, i32 5)
   6749   ret <2 x i64> %0
   6750 }
   6751 
   6752 declare <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64>, i32)
   6753 
   6754 define <2 x i64> @test_mm_mask_rol_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
   6755 ; X86-LABEL: test_mm_mask_rol_epi64:
   6756 ; X86:       # %bb.0: # %entry
   6757 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6758 ; X86-NEXT:    kmovw %eax, %k1
   6759 ; X86-NEXT:    vprolq $5, %xmm1, %xmm0 {%k1}
   6760 ; X86-NEXT:    retl
   6761 ;
   6762 ; X64-LABEL: test_mm_mask_rol_epi64:
   6763 ; X64:       # %bb.0: # %entry
   6764 ; X64-NEXT:    kmovw %edi, %k1
   6765 ; X64-NEXT:    vprolq $5, %xmm1, %xmm0 {%k1}
   6766 ; X64-NEXT:    retq
   6767 entry:
   6768   %0 = tail call <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64> %__A, i32 5)
   6769   %1 = bitcast i8 %__U to <8 x i1>
   6770   %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   6771   %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
   6772   ret <2 x i64> %2
   6773 }
   6774 
   6775 define <2 x i64> @test_mm_maskz_rol_epi64(i8 zeroext %__U, <2 x i64> %__A) {
   6776 ; X86-LABEL: test_mm_maskz_rol_epi64:
   6777 ; X86:       # %bb.0: # %entry
   6778 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6779 ; X86-NEXT:    kmovw %eax, %k1
   6780 ; X86-NEXT:    vprolq $5, %xmm0, %xmm0 {%k1} {z}
   6781 ; X86-NEXT:    retl
   6782 ;
   6783 ; X64-LABEL: test_mm_maskz_rol_epi64:
   6784 ; X64:       # %bb.0: # %entry
   6785 ; X64-NEXT:    kmovw %edi, %k1
   6786 ; X64-NEXT:    vprolq $5, %xmm0, %xmm0 {%k1} {z}
   6787 ; X64-NEXT:    retq
   6788 entry:
   6789   %0 = tail call <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64> %__A, i32 5)
   6790   %1 = bitcast i8 %__U to <8 x i1>
   6791   %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   6792   %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
   6793   ret <2 x i64> %2
   6794 }
   6795 
   6796 define <4 x i64> @test_mm256_rol_epi64(<4 x i64> %__A) {
   6797 ; CHECK-LABEL: test_mm256_rol_epi64:
   6798 ; CHECK:       # %bb.0: # %entry
   6799 ; CHECK-NEXT:    vprolq $5, %ymm0, %ymm0
   6800 ; CHECK-NEXT:    ret{{[l|q]}}
   6801 entry:
   6802   %0 = tail call <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64> %__A, i32 5)
   6803   ret <4 x i64> %0
   6804 }
   6805 
   6806 declare <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64>, i32)
   6807 
   6808 define <4 x i64> @test_mm256_mask_rol_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
   6809 ; X86-LABEL: test_mm256_mask_rol_epi64:
   6810 ; X86:       # %bb.0: # %entry
   6811 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6812 ; X86-NEXT:    kmovw %eax, %k1
   6813 ; X86-NEXT:    vprolq $5, %ymm1, %ymm0 {%k1}
   6814 ; X86-NEXT:    retl
   6815 ;
   6816 ; X64-LABEL: test_mm256_mask_rol_epi64:
   6817 ; X64:       # %bb.0: # %entry
   6818 ; X64-NEXT:    kmovw %edi, %k1
   6819 ; X64-NEXT:    vprolq $5, %ymm1, %ymm0 {%k1}
   6820 ; X64-NEXT:    retq
   6821 entry:
   6822   %0 = tail call <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64> %__A, i32 5)
   6823   %1 = bitcast i8 %__U to <8 x i1>
   6824   %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6825   %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
   6826   ret <4 x i64> %2
   6827 }
   6828 
   6829 define <4 x i64> @test_mm256_maskz_rol_epi64(i8 zeroext %__U, <4 x i64> %__A) {
   6830 ; X86-LABEL: test_mm256_maskz_rol_epi64:
   6831 ; X86:       # %bb.0: # %entry
   6832 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6833 ; X86-NEXT:    kmovw %eax, %k1
   6834 ; X86-NEXT:    vprolq $5, %ymm0, %ymm0 {%k1} {z}
   6835 ; X86-NEXT:    retl
   6836 ;
   6837 ; X64-LABEL: test_mm256_maskz_rol_epi64:
   6838 ; X64:       # %bb.0: # %entry
   6839 ; X64-NEXT:    kmovw %edi, %k1
   6840 ; X64-NEXT:    vprolq $5, %ymm0, %ymm0 {%k1} {z}
   6841 ; X64-NEXT:    retq
   6842 entry:
   6843   %0 = tail call <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64> %__A, i32 5)
   6844   %1 = bitcast i8 %__U to <8 x i1>
   6845   %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6846   %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
   6847   ret <4 x i64> %2
   6848 }
   6849 
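; NOTE: The rolv tests swap the immediate for a per-element rotate count taken
; from a second vector operand, lowering to vprolvd/vprolvq, e.g. with a
; hypothetical count vector:
;   %r = call <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64> %a, <2 x i64> %counts)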
   6850 define <2 x i64> @test_mm_rolv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
   6851 ; CHECK-LABEL: test_mm_rolv_epi32:
   6852 ; CHECK:       # %bb.0: # %entry
   6853 ; CHECK-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
   6854 ; CHECK-NEXT:    ret{{[l|q]}}
   6855 entry:
   6856   %0 = bitcast <2 x i64> %__A to <4 x i32>
   6857   %1 = bitcast <2 x i64> %__B to <4 x i32>
   6858   %2 = tail call <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32> %0, <4 x i32> %1)
   6859   %3 = bitcast <4 x i32> %2 to <2 x i64>
   6860   ret <2 x i64> %3
   6861 }
   6862 
   6863 define <2 x i64> @test_mm_mask_rolv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
   6864 ; X86-LABEL: test_mm_mask_rolv_epi32:
   6865 ; X86:       # %bb.0: # %entry
   6866 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6867 ; X86-NEXT:    kmovw %eax, %k1
   6868 ; X86-NEXT:    vprolvd %xmm2, %xmm1, %xmm0 {%k1}
   6869 ; X86-NEXT:    retl
   6870 ;
   6871 ; X64-LABEL: test_mm_mask_rolv_epi32:
   6872 ; X64:       # %bb.0: # %entry
   6873 ; X64-NEXT:    kmovw %edi, %k1
   6874 ; X64-NEXT:    vprolvd %xmm2, %xmm1, %xmm0 {%k1}
   6875 ; X64-NEXT:    retq
   6876 entry:
   6877   %0 = bitcast <2 x i64> %__A to <4 x i32>
   6878   %1 = bitcast <2 x i64> %__B to <4 x i32>
   6879   %2 = tail call <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32> %0, <4 x i32> %1)
   6880   %3 = bitcast <2 x i64> %__W to <4 x i32>
   6881   %4 = bitcast i8 %__U to <8 x i1>
   6882   %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6883   %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
   6884   %6 = bitcast <4 x i32> %5 to <2 x i64>
   6885   ret <2 x i64> %6
   6886 }
   6887 
   6888 define <2 x i64> @test_mm_maskz_rolv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
   6889 ; X86-LABEL: test_mm_maskz_rolv_epi32:
   6890 ; X86:       # %bb.0: # %entry
   6891 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6892 ; X86-NEXT:    kmovw %eax, %k1
   6893 ; X86-NEXT:    vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
   6894 ; X86-NEXT:    retl
   6895 ;
   6896 ; X64-LABEL: test_mm_maskz_rolv_epi32:
   6897 ; X64:       # %bb.0: # %entry
   6898 ; X64-NEXT:    kmovw %edi, %k1
   6899 ; X64-NEXT:    vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
   6900 ; X64-NEXT:    retq
   6901 entry:
   6902   %0 = bitcast <2 x i64> %__A to <4 x i32>
   6903   %1 = bitcast <2 x i64> %__B to <4 x i32>
   6904   %2 = tail call <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32> %0, <4 x i32> %1)
   6905   %3 = bitcast i8 %__U to <8 x i1>
   6906   %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6907   %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
   6908   %5 = bitcast <4 x i32> %4 to <2 x i64>
   6909   ret <2 x i64> %5
   6910 }
   6911 
   6912 define <4 x i64> @test_mm256_rolv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
   6913 ; CHECK-LABEL: test_mm256_rolv_epi32:
   6914 ; CHECK:       # %bb.0: # %entry
   6915 ; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
   6916 ; CHECK-NEXT:    ret{{[l|q]}}
   6917 entry:
   6918   %0 = bitcast <4 x i64> %__A to <8 x i32>
   6919   %1 = bitcast <4 x i64> %__B to <8 x i32>
   6920   %2 = tail call <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32> %0, <8 x i32> %1)
   6921   %3 = bitcast <8 x i32> %2 to <4 x i64>
   6922   ret <4 x i64> %3
   6923 }
   6924 
   6925 define <4 x i64> @test_mm256_mask_rolv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
   6926 ; X86-LABEL: test_mm256_mask_rolv_epi32:
   6927 ; X86:       # %bb.0: # %entry
   6928 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6929 ; X86-NEXT:    kmovw %eax, %k1
   6930 ; X86-NEXT:    vprolvd %ymm2, %ymm1, %ymm0 {%k1}
   6931 ; X86-NEXT:    retl
   6932 ;
   6933 ; X64-LABEL: test_mm256_mask_rolv_epi32:
   6934 ; X64:       # %bb.0: # %entry
   6935 ; X64-NEXT:    kmovw %edi, %k1
   6936 ; X64-NEXT:    vprolvd %ymm2, %ymm1, %ymm0 {%k1}
   6937 ; X64-NEXT:    retq
   6938 entry:
   6939   %0 = bitcast <4 x i64> %__A to <8 x i32>
   6940   %1 = bitcast <4 x i64> %__B to <8 x i32>
   6941   %2 = tail call <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32> %0, <8 x i32> %1)
   6942   %3 = bitcast <4 x i64> %__W to <8 x i32>
   6943   %4 = bitcast i8 %__U to <8 x i1>
   6944   %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
   6945   %6 = bitcast <8 x i32> %5 to <4 x i64>
   6946   ret <4 x i64> %6
   6947 }
   6948 
   6949 define <4 x i64> @test_mm256_maskz_rolv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
   6950 ; X86-LABEL: test_mm256_maskz_rolv_epi32:
   6951 ; X86:       # %bb.0: # %entry
   6952 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6953 ; X86-NEXT:    kmovw %eax, %k1
   6954 ; X86-NEXT:    vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
   6955 ; X86-NEXT:    retl
   6956 ;
   6957 ; X64-LABEL: test_mm256_maskz_rolv_epi32:
   6958 ; X64:       # %bb.0: # %entry
   6959 ; X64-NEXT:    kmovw %edi, %k1
   6960 ; X64-NEXT:    vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
   6961 ; X64-NEXT:    retq
   6962 entry:
   6963   %0 = bitcast <4 x i64> %__A to <8 x i32>
   6964   %1 = bitcast <4 x i64> %__B to <8 x i32>
   6965   %2 = tail call <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32> %0, <8 x i32> %1)
   6966   %3 = bitcast i8 %__U to <8 x i1>
   6967   %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
   6968   %5 = bitcast <8 x i32> %4 to <4 x i64>
   6969   ret <4 x i64> %5
   6970 }
   6971 
   6972 define <2 x i64> @test_mm_rolv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
   6973 ; CHECK-LABEL: test_mm_rolv_epi64:
   6974 ; CHECK:       # %bb.0: # %entry
   6975 ; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
   6976 ; CHECK-NEXT:    ret{{[l|q]}}
   6977 entry:
   6978   %0 = tail call <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64> %__A, <2 x i64> %__B)
   6979   ret <2 x i64> %0
   6980 }
   6981 
   6982 define <2 x i64> @test_mm_mask_rolv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
   6983 ; X86-LABEL: test_mm_mask_rolv_epi64:
   6984 ; X86:       # %bb.0: # %entry
   6985 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6986 ; X86-NEXT:    kmovw %eax, %k1
   6987 ; X86-NEXT:    vprolvq %xmm2, %xmm1, %xmm0 {%k1}
   6988 ; X86-NEXT:    retl
   6989 ;
   6990 ; X64-LABEL: test_mm_mask_rolv_epi64:
   6991 ; X64:       # %bb.0: # %entry
   6992 ; X64-NEXT:    kmovw %edi, %k1
   6993 ; X64-NEXT:    vprolvq %xmm2, %xmm1, %xmm0 {%k1}
   6994 ; X64-NEXT:    retq
   6995 entry:
   6996   %0 = tail call <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64> %__A, <2 x i64> %__B)
   6997   %1 = bitcast i8 %__U to <8 x i1>
   6998   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   6999   %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
   7000   ret <2 x i64> %2
   7001 }
   7002 
   7003 define <2 x i64> @test_mm_maskz_rolv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
   7004 ; X86-LABEL: test_mm_maskz_rolv_epi64:
   7005 ; X86:       # %bb.0: # %entry
   7006 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   7007 ; X86-NEXT:    kmovw %eax, %k1
   7008 ; X86-NEXT:    vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
   7009 ; X86-NEXT:    retl
   7010 ;
   7011 ; X64-LABEL: test_mm_maskz_rolv_epi64:
   7012 ; X64:       # %bb.0: # %entry
   7013 ; X64-NEXT:    kmovw %edi, %k1
   7014 ; X64-NEXT:    vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
   7015 ; X64-NEXT:    retq
   7016 entry:
   7017   %0 = tail call <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64> %__A, <2 x i64> %__B)
   7018   %1 = bitcast i8 %__U to <8 x i1>
   7019   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
   7020   %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
   7021   ret <2 x i64> %2
   7022 }
   7023 
   7024 define <4 x i64> @test_mm256_rolv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
   7025 ; CHECK-LABEL: test_mm256_rolv_epi64:
   7026 ; CHECK:       # %bb.0: # %entry
   7027 ; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
   7028 ; CHECK-NEXT:    ret{{[l|q]}}
   7029 entry:
   7030   %0 = tail call <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64> %__A, <4 x i64> %__B)
   7031   ret <4 x i64> %0
   7032 }
   7033 
   7034 define <4 x i64> @test_mm256_mask_rolv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
   7035 ; X86-LABEL: test_mm256_mask_rolv_epi64:
   7036 ; X86:       # %bb.0: # %entry
   7037 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   7038 ; X86-NEXT:    kmovw %eax, %k1
   7039 ; X86-NEXT:    vprolvq %ymm2, %ymm1, %ymm0 {%k1}
   7040 ; X86-NEXT:    retl
   7041 ;
   7042 ; X64-LABEL: test_mm256_mask_rolv_epi64:
   7043 ; X64:       # %bb.0: # %entry
   7044 ; X64-NEXT:    kmovw %edi, %k1
   7045 ; X64-NEXT:    vprolvq %ymm2, %ymm1, %ymm0 {%k1}
   7046 ; X64-NEXT:    retq
   7047 entry:
   7048   %0 = tail call <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64> %__A, <4 x i64> %__B)
   7049   %1 = bitcast i8 %__U to <8 x i1>
   7050   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7051   %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
   7052   ret <4 x i64> %2
   7053 }
   7054 
   7055 define <4 x i64> @test_mm256_maskz_rolv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
   7056 ; X86-LABEL: test_mm256_maskz_rolv_epi64:
   7057 ; X86:       # %bb.0: # %entry
   7058 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   7059 ; X86-NEXT:    kmovw %eax, %k1
   7060 ; X86-NEXT:    vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
   7061 ; X86-NEXT:    retl
   7062 ;
   7063 ; X64-LABEL: test_mm256_maskz_rolv_epi64:
   7064 ; X64:       # %bb.0: # %entry
   7065 ; X64-NEXT:    kmovw %edi, %k1
   7066 ; X64-NEXT:    vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
   7067 ; X64-NEXT:    retq
   7068 entry:
   7069   %0 = tail call <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64> %__A, <4 x i64> %__B)
   7070   %1 = bitcast i8 %__U to <8 x i1>
   7071   %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7072   %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
   7073   ret <4 x i64> %2
   7074 }
   7075 
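; NOTE: The right-rotate tests mirror the rol tests above, using the
; llvm.x86.avx512.pror.* intrinsics and lowering to vprord/vprorq.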
define <2 x i64> @test_mm_ror_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_ror_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprord $5, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

declare <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32>, i32)

define <2 x i64> @test_mm_mask_ror_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast <2 x i64> %__W to <4 x i32>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
  %5 = bitcast <4 x i32> %4 to <2 x i64>
  ret <2 x i64> %5
}

define <2 x i64> @test_mm_maskz_ror_epi32(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <4 x i64> @test_mm256_ror_epi32(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_ror_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprord $5, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

declare <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32>, i32)

define <4 x i64> @test_mm256_mask_ror_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast <4 x i64> %__W to <8 x i32>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
  %5 = bitcast <8 x i32> %4 to <4 x i64>
  ret <4 x i64> %5
}

define <4 x i64> @test_mm256_maskz_ror_epi32(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <2 x i64> @test_mm_ror_epi64(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_ror_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorq $5, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64> %__A, i32 5)
  ret <2 x i64> %0
}

declare <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64>, i32)

define <2 x i64> @test_mm_mask_ror_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_ror_epi64(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

define <4 x i64> @test_mm256_ror_epi64(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_ror_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorq $5, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64> %__A, i32 5)
  ret <4 x i64> %0
}

declare <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64>, i32)

define <4 x i64> @test_mm256_mask_ror_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_ror_epi64(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}

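; NOTE: The remaining tests exercise the variable rotate-right intrinsics
; (vprorvd/vprorvq), which take a per-element rotate count in the second
; source vector. For reference only (not one of the autogenerated tests),
; a minimal sketch of the same per-element rotate-right written with the
; target-independent funnel-shift intrinsic; @rorv_fshr_sketch is a
; hypothetical name used purely for illustration:
;
;   define <4 x i32> @rorv_fshr_sketch(<4 x i32> %x, <4 x i32> %amt) {
;   entry:
;     ; rotate-right is a funnel shift right with both inputs equal
;     %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %amt)
;     ret <4 x i32> %r
;   }
;   declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)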
define <2 x i64> @test_mm_rorv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rorv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm_mask_rorv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast <2 x i64> %__W to <4 x i32>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_rorv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
  %5 = bitcast <4 x i32> %4 to <2 x i64>
  ret <2 x i64> %5
}

define <4 x i64> @test_mm256_rorv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rorv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm256_mask_rorv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast <4 x i64> %__W to <8 x i32>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_rorv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
  %5 = bitcast <8 x i32> %4 to <4 x i64>
  ret <4 x i64> %5
}

define <2 x i64> @test_mm_rorv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rorv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  ret <2 x i64> %0
}

define <2 x i64> @test_mm_mask_rorv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_rorv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

define <4 x i64> @test_mm256_rorv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rorv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  ret <4 x i64> %0
}

define <4 x i64> @test_mm256_mask_rorv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_rorv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}

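; Declarations for the intrinsics exercised by the tests above.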
declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>)
declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8)
declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8)
declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>)
declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8)
declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8)
declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)
declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>)
declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>)
declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
declare <2 x double> @llvm.masked.expandload.v2f64(double*, <2 x i1>, <2 x double>)
declare <4 x double> @llvm.masked.expandload.v4f64(double*, <4 x i1>, <4 x double>)
declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
declare <4 x i64> @llvm.masked.expandload.v4i64(i64*, <4 x i1>, <4 x i64>)
declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.expandload.v8f32(float*, <8 x i1>, <8 x float>)
declare <4 x i32> @llvm.masked.expandload.v4i32(i32*, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.expandload.v8i32(i32*, <8 x i1>, <8 x i32>)
declare void @llvm.masked.compressstore.v2f64(<2 x double>, double*, <2 x i1>)
declare void @llvm.masked.compressstore.v4f64(<4 x double>, double*, <4 x i1>)
declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64*, <2 x i1>)
declare void @llvm.masked.compressstore.v4i64(<4 x i64>, i64*, <4 x i1>)
declare void @llvm.masked.compressstore.v4f32(<4 x float>, float*, <4 x i1>)
declare void @llvm.masked.compressstore.v8f32(<8 x float>, float*, <8 x i1>)
declare void @llvm.masked.compressstore.v4i32(<4 x i32>, i32*, <4 x i1>)
declare void @llvm.masked.compressstore.v8i32(<8 x i32>, i32*, <8 x i1>)
declare <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32>, <8 x i32>)
declare <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64>, <4 x i64>)
declare <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32>, <8 x i32>)
declare <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64>, <4 x i64>)

!0 = !{i32 1}