      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
      3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64
      4 
      5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
      6 
      7 
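; The tests below model clang's lowering of the AVX-512 mask intrinsics when
; compiled with -mavx512f (matching the -mattr=+avx512f RUN lines). As a hedged
; illustration only -- the authoritative source is the builtins test named in
; the NOTE above, and the wrapper name here is invented -- the first test
; roughly corresponds to C along these lines:
;
;   #include <immintrin.h>
;   // Build two masks from compares, concatenate their low bytes with
;   // kunpackb, then use the result as the write-mask of a third compare.
;   __mmask16 kunpackb_of_cmps(__m512i A, __m512i B, __m512i C, __m512i D,
;                              __m512i E, __m512i F) {
;     __mmask16 k0 = _mm512_cmpneq_epi32_mask(A, B);
;     __mmask16 k1 = _mm512_cmpneq_epi32_mask(C, D);
;     __mmask16 k  = _mm512_kunpackb(k0, k1);            // kunpckbw
;     return _mm512_mask_cmpneq_epi32_mask(k, E, F);     // compare under mask k
;   }
;
; Which compare lands in which byte of the unpacked mask follows the kunpckbw
; semantics; see the shufflevector concatenation in the IR below. The kortestc
; and kortestz tests likewise model _mm512_kortestc/_mm512_kortestz, which OR
; two masks and test the result against all-ones and all-zeros respectively.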
      8 define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
      9 ; X86-LABEL: test_mm512_kunpackb:
     10 ; X86:       # %bb.0: # %entry
     11 ; X86-NEXT:    pushl %ebp
     12 ; X86-NEXT:    .cfi_def_cfa_offset 8
     13 ; X86-NEXT:    .cfi_offset %ebp, -8
     14 ; X86-NEXT:    movl %esp, %ebp
     15 ; X86-NEXT:    .cfi_def_cfa_register %ebp
     16 ; X86-NEXT:    andl $-64, %esp
     17 ; X86-NEXT:    subl $64, %esp
     18 ; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
     19 ; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
     20 ; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
     21 ; X86-NEXT:    kunpckbw %k0, %k1, %k1
     22 ; X86-NEXT:    vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
     23 ; X86-NEXT:    kmovw %k0, %eax
     24 ; X86-NEXT:    movzwl %ax, %eax
     25 ; X86-NEXT:    movl %ebp, %esp
     26 ; X86-NEXT:    popl %ebp
     27 ; X86-NEXT:    .cfi_def_cfa %esp, 4
     28 ; X86-NEXT:    vzeroupper
     29 ; X86-NEXT:    retl
     30 ;
     31 ; X64-LABEL: test_mm512_kunpackb:
     32 ; X64:       # %bb.0: # %entry
     33 ; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
     34 ; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
     35 ; X64-NEXT:    kunpckbw %k0, %k1, %k1
     36 ; X64-NEXT:    vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
     37 ; X64-NEXT:    kmovw %k0, %eax
     38 ; X64-NEXT:    movzwl %ax, %eax
     39 ; X64-NEXT:    vzeroupper
     40 ; X64-NEXT:    retq
     41 entry:
     42   %0 = bitcast <8 x i64> %__E to <16 x i32>
     43   %1 = bitcast <8 x i64> %__F to <16 x i32>
     44   %2 = bitcast <8 x i64> %__A to <16 x i32>
     45   %3 = bitcast <8 x i64> %__B to <16 x i32>
     46   %4 = icmp ne <16 x i32> %2, %3
     47   %5 = bitcast <8 x i64> %__C to <16 x i32>
     48   %6 = bitcast <8 x i64> %__D to <16 x i32>
     49   %7 = icmp ne <16 x i32> %5, %6
     50   %8 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
     51   %9 = shufflevector <16 x i1> %7, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
     52   %10 = shufflevector <8 x i1> %8, <8 x i1> %9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     53   %11 = icmp ne <16 x i32> %0, %1
     54   %12 = and <16 x i1> %11, %10
     55   %13 = bitcast <16 x i1> %12 to i16
     56   ret i16 %13
     57 }
     58 
     59 define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
     60 ; X86-LABEL: test_mm512_kortestc:
     61 ; X86:       # %bb.0: # %entry
     62 ; X86-NEXT:    pushl %ebp
     63 ; X86-NEXT:    .cfi_def_cfa_offset 8
     64 ; X86-NEXT:    .cfi_offset %ebp, -8
     65 ; X86-NEXT:    movl %esp, %ebp
     66 ; X86-NEXT:    .cfi_def_cfa_register %ebp
     67 ; X86-NEXT:    andl $-64, %esp
     68 ; X86-NEXT:    subl $64, %esp
     69 ; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
     70 ; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
     71 ; X86-NEXT:    korw %k0, %k1, %k0
     72 ; X86-NEXT:    kmovw %k0, %eax
     73 ; X86-NEXT:    cmpw $-1, %ax
     74 ; X86-NEXT:    sete %al
     75 ; X86-NEXT:    andb $1, %al
     76 ; X86-NEXT:    movzbl %al, %eax
     77 ; X86-NEXT:    movl %ebp, %esp
     78 ; X86-NEXT:    popl %ebp
     79 ; X86-NEXT:    .cfi_def_cfa %esp, 4
     80 ; X86-NEXT:    vzeroupper
     81 ; X86-NEXT:    retl
     82 ;
     83 ; X64-LABEL: test_mm512_kortestc:
     84 ; X64:       # %bb.0: # %entry
     85 ; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
     86 ; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
     87 ; X64-NEXT:    korw %k0, %k1, %k0
     88 ; X64-NEXT:    kmovw %k0, %eax
     89 ; X64-NEXT:    cmpw $-1, %ax
     90 ; X64-NEXT:    sete %al
     91 ; X64-NEXT:    andb $1, %al
     92 ; X64-NEXT:    movzbl %al, %eax
     93 ; X64-NEXT:    vzeroupper
     94 ; X64-NEXT:    retq
     95 entry:
     96   %0 = bitcast <8 x i64> %__A to <16 x i32>
     97   %1 = bitcast <8 x i64> %__B to <16 x i32>
     98   %2 = icmp ne <16 x i32> %0, %1
     99   %3 = bitcast <8 x i64> %__C to <16 x i32>
    100   %4 = bitcast <8 x i64> %__D to <16 x i32>
    101   %5 = icmp ne <16 x i32> %3, %4
     102   %6 = or <16 x i1> %5, %2
           %7 = bitcast <16 x i1> %6 to i16
    103   %8 = icmp eq i16 %7, -1
    104   %9 = zext i1 %8 to i32
    105   ret i32 %9
    106 }
    107 
    108 define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
    109 ; X86-LABEL: test_mm512_kortestz:
    110 ; X86:       # %bb.0: # %entry
    111 ; X86-NEXT:    pushl %ebp
    112 ; X86-NEXT:    .cfi_def_cfa_offset 8
    113 ; X86-NEXT:    .cfi_offset %ebp, -8
    114 ; X86-NEXT:    movl %esp, %ebp
    115 ; X86-NEXT:    .cfi_def_cfa_register %ebp
    116 ; X86-NEXT:    andl $-64, %esp
    117 ; X86-NEXT:    subl $64, %esp
    118 ; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
    119 ; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
    120 ; X86-NEXT:    korw %k0, %k1, %k0
    121 ; X86-NEXT:    kmovw %k0, %eax
    122 ; X86-NEXT:    cmpw $0, %ax
    123 ; X86-NEXT:    sete %al
    124 ; X86-NEXT:    andb $1, %al
    125 ; X86-NEXT:    movzbl %al, %eax
    126 ; X86-NEXT:    movl %ebp, %esp
    127 ; X86-NEXT:    popl %ebp
    128 ; X86-NEXT:    .cfi_def_cfa %esp, 4
    129 ; X86-NEXT:    vzeroupper
    130 ; X86-NEXT:    retl
    131 ;
    132 ; X64-LABEL: test_mm512_kortestz:
    133 ; X64:       # %bb.0: # %entry
    134 ; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
    135 ; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
    136 ; X64-NEXT:    korw %k0, %k1, %k0
    137 ; X64-NEXT:    kmovw %k0, %eax
    138 ; X64-NEXT:    cmpw $0, %ax
    139 ; X64-NEXT:    sete %al
    140 ; X64-NEXT:    andb $1, %al
    141 ; X64-NEXT:    movzbl %al, %eax
    142 ; X64-NEXT:    vzeroupper
    143 ; X64-NEXT:    retq
    144 entry:
    145   %0 = bitcast <8 x i64> %__A to <16 x i32>
    146   %1 = bitcast <8 x i64> %__B to <16 x i32>
    147   %2 = icmp ne <16 x i32> %0, %1
    148   %3 = bitcast <8 x i64> %__C to <16 x i32>
    149   %4 = bitcast <8 x i64> %__D to <16 x i32>
    150   %5 = icmp ne <16 x i32> %3, %4
    151   %6 = or <16 x i1> %5, %2
    152   %7 = bitcast <16 x i1> %6 to i16
    153   %8 = icmp eq i16 %7, 0
    154   %9 = zext i1 %8 to i32
    155   ret i32 %9
    156 }
    157 
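; The next group exercises the 128-bit-lane shuffles (vshuff32x4/f64x2 and the
; integer vshufi32x4/i64x2 forms). A hedged C-level sketch of the plain,
; merge-masked and zero-masked pattern these tests model; the wrapper names are
; invented, and an immediate of 4 reproduces the lane pattern in the IR below
; (lanes 0,1 of the first operand, then lane 0 of the second operand twice):
;
;   #include <immintrin.h>
;   __m512 lanes(__m512 A, __m512 B) {
;     return _mm512_shuffle_f32x4(A, B, 4);              // vshuff32x4
;   }
;   __m512 lanes_masked(__m512 W, __mmask16 U, __m512 A, __m512 B) {
;     return _mm512_mask_shuffle_f32x4(W, U, A, B, 4);   // merge into W under U
;   }
;   __m512 lanes_zeroed(__mmask16 U, __m512 A, __m512 B) {
;     return _mm512_maskz_shuffle_f32x4(U, A, B, 4);     // zero where U is clear
;   }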
    158 define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
    159 ; CHECK-LABEL: test_mm512_shuffle_f32x4:
    160 ; CHECK:       # %bb.0: # %entry
    161 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
    162 ; CHECK-NEXT:    ret{{[l|q]}}
    163 entry:
    164   %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
    165   ret <16 x float> %shuffle
    166 }
    167 
    168 
    169 define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
    170 ; X86-LABEL: test_mm512_mask_shuffle_f32x4:
    171 ; X86:       # %bb.0: # %entry
    172 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    173 ; X86-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
    174 ; X86-NEXT:    retl
    175 ;
    176 ; X64-LABEL: test_mm512_mask_shuffle_f32x4:
    177 ; X64:       # %bb.0: # %entry
    178 ; X64-NEXT:    kmovw %edi, %k1
    179 ; X64-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
    180 ; X64-NEXT:    retq
    181 entry:
    182   %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
    183   %0 = bitcast i16 %__U to <16 x i1>
    184   %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
    185   ret <16 x float> %1
    186 }
    187 
    188 define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
    189 ; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
    190 ; X86:       # %bb.0: # %entry
    191 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    192 ; X86-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
    193 ; X86-NEXT:    retl
    194 ;
    195 ; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
    196 ; X64:       # %bb.0: # %entry
    197 ; X64-NEXT:    kmovw %edi, %k1
    198 ; X64-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
    199 ; X64-NEXT:    retq
    200 entry:
    201   %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
    202   %0 = bitcast i16 %__U to <16 x i1>
    203   %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
    204   ret <16 x float> %1
    205 }
    206 
    207 define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
    208 ; CHECK-LABEL: test_mm512_shuffle_f64x2:
    209 ; CHECK:       # %bb.0: # %entry
    210 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
    211 ; CHECK-NEXT:    ret{{[l|q]}}
    212 entry:
    213   %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
    214   ret <8 x double> %shuffle
    215 }
    216 
    217 define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
    218 ; X86-LABEL: test_mm512_mask_shuffle_f64x2:
    219 ; X86:       # %bb.0: # %entry
    220 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    221 ; X86-NEXT:    kmovw %eax, %k1
    222 ; X86-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
    223 ; X86-NEXT:    retl
    224 ;
    225 ; X64-LABEL: test_mm512_mask_shuffle_f64x2:
    226 ; X64:       # %bb.0: # %entry
    227 ; X64-NEXT:    kmovw %edi, %k1
    228 ; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
    229 ; X64-NEXT:    retq
    230 entry:
    231   %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
    232   %0 = bitcast i8 %__U to <8 x i1>
    233   %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
    234   ret <8 x double> %1
    235 }
    236 
    237 define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
    238 ; X86-LABEL: test_mm512_maskz_shuffle_f64x2:
    239 ; X86:       # %bb.0: # %entry
    240 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    241 ; X86-NEXT:    kmovw %eax, %k1
    242 ; X86-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
    243 ; X86-NEXT:    retl
    244 ;
    245 ; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
    246 ; X64:       # %bb.0: # %entry
    247 ; X64-NEXT:    kmovw %edi, %k1
    248 ; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
    249 ; X64-NEXT:    retq
    250 entry:
    251   %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
    252   %0 = bitcast i8 %__U to <8 x i1>
    253   %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
    254   ret <8 x double> %1
    255 }
    256 
    257 define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
    258 ; CHECK-LABEL: test_mm512_shuffle_i32x4:
    259 ; CHECK:       # %bb.0: # %entry
    260 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
    261 ; CHECK-NEXT:    ret{{[l|q]}}
    262 entry:
    263   %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
    264   ret <8 x i64> %shuffle
    265 }
    266 
    267 define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
    268 ; X86-LABEL: test_mm512_mask_shuffle_i32x4:
    269 ; X86:       # %bb.0: # %entry
    270 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    271 ; X86-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
    272 ; X86-NEXT:    retl
    273 ;
    274 ; X64-LABEL: test_mm512_mask_shuffle_i32x4:
    275 ; X64:       # %bb.0: # %entry
    276 ; X64-NEXT:    kmovw %edi, %k1
    277 ; X64-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
    278 ; X64-NEXT:    retq
    279 entry:
    280   %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
    281   %0 = bitcast <8 x i64> %shuffle to <16 x i32>
    282   %1 = bitcast <8 x i64> %__W to <16 x i32>
    283   %2 = bitcast i16 %__U to <16 x i1>
    284   %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
    285   %4 = bitcast <16 x i32> %3 to <8 x i64>
    286   ret <8 x i64> %4
    287 }
    288 
    289 define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
    290 ; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
    291 ; X86:       # %bb.0: # %entry
    292 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    293 ; X86-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
    294 ; X86-NEXT:    retl
    295 ;
    296 ; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
    297 ; X64:       # %bb.0: # %entry
    298 ; X64-NEXT:    kmovw %edi, %k1
    299 ; X64-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
    300 ; X64-NEXT:    retq
    301 entry:
    302   %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
    303   %0 = bitcast <8 x i64> %shuffle to <16 x i32>
    304   %1 = bitcast i16 %__U to <16 x i1>
    305   %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
    306   %3 = bitcast <16 x i32> %2 to <8 x i64>
    307   ret <8 x i64> %3
    308 }
    309 
    310 define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
    311 ; CHECK-LABEL: test_mm512_shuffle_i64x2:
    312 ; CHECK:       # %bb.0: # %entry
    313 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
    314 ; CHECK-NEXT:    ret{{[l|q]}}
    315 entry:
    316   %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
    317   ret <8 x i64> %shuffle
    318 }
    319 
    320 define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
    321 ; X86-LABEL: test_mm512_mask_shuffle_i64x2:
    322 ; X86:       # %bb.0: # %entry
    323 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    324 ; X86-NEXT:    kmovw %eax, %k1
    325 ; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
    326 ; X86-NEXT:    retl
    327 ;
    328 ; X64-LABEL: test_mm512_mask_shuffle_i64x2:
    329 ; X64:       # %bb.0: # %entry
    330 ; X64-NEXT:    kmovw %edi, %k1
    331 ; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
    332 ; X64-NEXT:    retq
    333 entry:
    334   %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
    335   %0 = bitcast i8 %__U to <8 x i1>
    336   %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
    337   ret <8 x i64> %1
    338 }
    339 
    340 define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
    341 ; X86-LABEL: test_mm512_maskz_shuffle_i64x2:
    342 ; X86:       # %bb.0: # %entry
    343 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    344 ; X86-NEXT:    kmovw %eax, %k1
    345 ; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
    346 ; X86-NEXT:    retl
    347 ;
    348 ; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
    349 ; X64:       # %bb.0: # %entry
    350 ; X64-NEXT:    kmovw %edi, %k1
    351 ; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
    352 ; X64-NEXT:    retq
    353 entry:
    354   %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
    355   %0 = bitcast i8 %__U to <8 x i1>
    356   %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
    357   ret <8 x i64> %1
    358 }
    359 
    360 
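; The test/testn mask tests model the vptestm/vptestnm intrinsics: AND the two
; vector operands and compare each element against zero, optionally ANDing the
; resulting mask with an incoming write-mask. A hedged sketch with invented
; wrapper names, assuming the usual AVX-512F signatures:
;
;   #include <immintrin.h>
;   __mmask16 testn32(__m512i A, __m512i B) {
;     return _mm512_testn_epi32_mask(A, B);              // (A & B) == 0 per i32
;   }
;   __mmask16 testn32_masked(__mmask16 U, __m512i A, __m512i B) {
;     return _mm512_mask_testn_epi32_mask(U, A, B);      // result ANDed with U
;   }
;   __mmask8 test64_masked(__mmask8 U, __m512i A, __m512i B) {
;     return _mm512_mask_test_epi64_mask(U, A, B);       // (A & B) != 0 per i64
;   }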
    361 define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
    362 ; CHECK-LABEL: test_mm512_testn_epi32_mask:
    363 ; CHECK:       # %bb.0: # %entry
    364 ; CHECK-NEXT:    vptestnmd %zmm0, %zmm1, %k0
    365 ; CHECK-NEXT:    kmovw %k0, %eax
    366 ; CHECK-NEXT:    movzwl %ax, %eax
    367 ; CHECK-NEXT:    vzeroupper
    368 ; CHECK-NEXT:    ret{{[l|q]}}
    369 entry:
    370   %and1.i.i = and <8 x i64> %__B, %__A
    371   %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
    372   %1 = icmp eq <16 x i32> %0, zeroinitializer
    373   %2 = bitcast <16 x i1> %1 to i16
    374   ret i16 %2
    375 }
    376 
    377 define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
    378 ; X86-LABEL: test_mm512_mask_testn_epi32_mask:
    379 ; X86:       # %bb.0: # %entry
    380 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    381 ; X86-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
    382 ; X86-NEXT:    kmovw %k0, %eax
    383 ; X86-NEXT:    movzwl %ax, %eax
    384 ; X86-NEXT:    vzeroupper
    385 ; X86-NEXT:    retl
    386 ;
    387 ; X64-LABEL: test_mm512_mask_testn_epi32_mask:
    388 ; X64:       # %bb.0: # %entry
    389 ; X64-NEXT:    kmovw %edi, %k1
    390 ; X64-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
    391 ; X64-NEXT:    kmovw %k0, %eax
    392 ; X64-NEXT:    movzwl %ax, %eax
    393 ; X64-NEXT:    vzeroupper
    394 ; X64-NEXT:    retq
    395 entry:
    396   %and1.i.i = and <8 x i64> %__B, %__A
    397   %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
    398   %1 = icmp eq <16 x i32> %0, zeroinitializer
    399   %2 = bitcast i16 %__U to <16 x i1>
    400   %3 = and <16 x i1> %1, %2
    401   %4 = bitcast <16 x i1> %3 to i16
    402   ret i16 %4
    403 }
    404 
    405 define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
    406 ; CHECK-LABEL: test_mm512_testn_epi64_mask:
    407 ; CHECK:       # %bb.0: # %entry
    408 ; CHECK-NEXT:    vptestnmq %zmm0, %zmm1, %k0
    409 ; CHECK-NEXT:    kmovw %k0, %eax
    410 ; CHECK-NEXT:    movzbl %al, %eax
    411 ; CHECK-NEXT:    vzeroupper
    412 ; CHECK-NEXT:    ret{{[l|q]}}
    413 entry:
    414   %and1.i.i = and <8 x i64> %__B, %__A
    415   %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
    416   %1 = bitcast <8 x i1> %0 to i8
    417   ret i8 %1
    418 }
    419 
    420 define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
    421 ; X86-LABEL: test_mm512_mask_testn_epi64_mask:
    422 ; X86:       # %bb.0: # %entry
    423 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    424 ; X86-NEXT:    kmovw %eax, %k1
    425 ; X86-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
    426 ; X86-NEXT:    kmovw %k0, %eax
    427 ; X86-NEXT:    movzbl %al, %eax
    428 ; X86-NEXT:    vzeroupper
    429 ; X86-NEXT:    retl
    430 ;
    431 ; X64-LABEL: test_mm512_mask_testn_epi64_mask:
    432 ; X64:       # %bb.0: # %entry
    433 ; X64-NEXT:    kmovw %edi, %k1
    434 ; X64-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
    435 ; X64-NEXT:    kmovw %k0, %eax
    436 ; X64-NEXT:    movzbl %al, %eax
    437 ; X64-NEXT:    vzeroupper
    438 ; X64-NEXT:    retq
    439 entry:
    440   %and1.i.i = and <8 x i64> %__B, %__A
    441   %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
    442   %1 = bitcast i8 %__U to <8 x i1>
    443   %2 = and <8 x i1> %0, %1
    444   %3 = bitcast <8 x i1> %2 to i8
    445   ret i8 %3
    446 }
    447 
    448 define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
    449 ; X86-LABEL: test_mm512_mask_test_epi32_mask:
    450 ; X86:       # %bb.0: # %entry
    451 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    452 ; X86-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
    453 ; X86-NEXT:    kmovw %k0, %eax
    454 ; X86-NEXT:    movzwl %ax, %eax
    455 ; X86-NEXT:    vzeroupper
    456 ; X86-NEXT:    retl
    457 ;
    458 ; X64-LABEL: test_mm512_mask_test_epi32_mask:
    459 ; X64:       # %bb.0: # %entry
    460 ; X64-NEXT:    kmovw %edi, %k1
    461 ; X64-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
    462 ; X64-NEXT:    kmovw %k0, %eax
    463 ; X64-NEXT:    movzwl %ax, %eax
    464 ; X64-NEXT:    vzeroupper
    465 ; X64-NEXT:    retq
    466 entry:
    467   %and1.i.i = and <8 x i64> %__B, %__A
    468   %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
    469   %1 = icmp ne <16 x i32> %0, zeroinitializer
    470   %2 = bitcast i16 %__U to <16 x i1>
    471   %3 = and <16 x i1> %1, %2
    472   %4 = bitcast <16 x i1> %3 to i16
    473   ret i16 %4
    474 }
    475 
    476 define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
    477 ; X86-LABEL: test_mm512_mask_test_epi64_mask:
    478 ; X86:       # %bb.0: # %entry
    479 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    480 ; X86-NEXT:    kmovw %eax, %k1
    481 ; X86-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
    482 ; X86-NEXT:    kmovw %k0, %eax
    483 ; X86-NEXT:    movzbl %al, %eax
    484 ; X86-NEXT:    vzeroupper
    485 ; X86-NEXT:    retl
    486 ;
    487 ; X64-LABEL: test_mm512_mask_test_epi64_mask:
    488 ; X64:       # %bb.0: # %entry
    489 ; X64-NEXT:    kmovw %edi, %k1
    490 ; X64-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
    491 ; X64-NEXT:    kmovw %k0, %eax
    492 ; X64-NEXT:    movzbl %al, %eax
    493 ; X64-NEXT:    vzeroupper
    494 ; X64-NEXT:    retq
    495 entry:
    496   %and1.i.i = and <8 x i64> %__B, %__A
    497   %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer
    498   %1 = bitcast i8 %__U to <8 x i1>
    499   %2 = and <8 x i1> %0, %1
    500   %3 = bitcast <8 x i1> %2 to i8
    501   ret i8 %3
    502 }
    503 
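; The set1 tests model the masked scalar-broadcast intrinsics: splat one scalar
; across all elements, then merge or zero under the write-mask. A hedged sketch
; (wrapper names invented). Note how the X86 checks below assemble the 64-bit
; scalar through an XMM register first, since i386 has no 64-bit GPR broadcast:
;
;   #include <immintrin.h>
;   __m512i set1_32_merge(__m512i O, __mmask16 M, int a) {
;     return _mm512_mask_set1_epi32(O, M, a);    // vpbroadcastd {%k1}
;   }
;   __m512i set1_64_zero(__mmask8 M, long long a) {
;     return _mm512_maskz_set1_epi64(M, a);      // vpbroadcastq {%k1}{z}
;   }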
    504 define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
    505 ; X86-LABEL: test_mm512_mask_set1_epi32:
    506 ; X86:       # %bb.0: # %entry
    507 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    508 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    509 ; X86-NEXT:    vpbroadcastd %eax, %zmm0 {%k1}
    510 ; X86-NEXT:    retl
    511 ;
    512 ; X64-LABEL: test_mm512_mask_set1_epi32:
    513 ; X64:       # %bb.0: # %entry
    514 ; X64-NEXT:    kmovw %edi, %k1
    515 ; X64-NEXT:    vpbroadcastd %esi, %zmm0 {%k1}
    516 ; X64-NEXT:    retq
    517 entry:
    518   %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
    519   %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
    520   %0 = bitcast <8 x i64> %__O to <16 x i32>
    521   %1 = bitcast i16 %__M to <16 x i1>
    522   %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
    523   %3 = bitcast <16 x i32> %2 to <8 x i64>
    524   ret <8 x i64> %3
    525 }
    526 
    527 define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A)  {
    528 ; X86-LABEL: test_mm512_maskz_set1_epi32:
    529 ; X86:       # %bb.0: # %entry
    530 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    531 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    532 ; X86-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z}
    533 ; X86-NEXT:    retl
    534 ;
    535 ; X64-LABEL: test_mm512_maskz_set1_epi32:
    536 ; X64:       # %bb.0: # %entry
    537 ; X64-NEXT:    kmovw %edi, %k1
    538 ; X64-NEXT:    vpbroadcastd %esi, %zmm0 {%k1} {z}
    539 ; X64-NEXT:    retq
    540 entry:
    541   %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
    542   %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
    543   %0 = bitcast i16 %__M to <16 x i1>
    544   %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
    545   %2 = bitcast <16 x i32> %1 to <8 x i64>
    546   ret <8 x i64> %2
    547 }
    548 
    549 define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
    550 ; X86-LABEL: test_mm512_mask_set1_epi64:
    551 ; X86:       # %bb.0: # %entry
    552 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    553 ; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
    554 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
    555 ; X86-NEXT:    kmovw %eax, %k1
    556 ; X86-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
    557 ; X86-NEXT:    retl
    558 ;
    559 ; X64-LABEL: test_mm512_mask_set1_epi64:
    560 ; X64:       # %bb.0: # %entry
    561 ; X64-NEXT:    kmovw %edi, %k1
    562 ; X64-NEXT:    vpbroadcastq %rsi, %zmm0 {%k1}
    563 ; X64-NEXT:    retq
    564 entry:
    565   %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
    566   %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
    567   %0 = bitcast i8 %__M to <8 x i1>
    568   %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
    569   ret <8 x i64> %1
    570 }
    571 
    572 define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A)  {
    573 ; X86-LABEL: test_mm512_maskz_set1_epi64:
    574 ; X86:       # %bb.0: # %entry
    575 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    576 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    577 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
    578 ; X86-NEXT:    kmovw %eax, %k1
    579 ; X86-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
    580 ; X86-NEXT:    retl
    581 ;
    582 ; X64-LABEL: test_mm512_maskz_set1_epi64:
    583 ; X64:       # %bb.0: # %entry
    584 ; X64-NEXT:    kmovw %edi, %k1
    585 ; X64-NEXT:    vpbroadcastq %rsi, %zmm0 {%k1} {z}
    586 ; X64-NEXT:    retq
    587 entry:
    588   %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
    589   %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
    590   %0 = bitcast i8 %__M to <8 x i1>
    591   %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
    592   ret <8 x i64> %1
    593 }
    594 
    595 
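; The broadcast tests splat the low element of a 128-bit source across a ZMM
; register. In the unmasked checks below the integer forms select the FP-domain
; vbroadcastss/vbroadcastsd encodings, which is expected. A hedged sketch with
; invented wrapper names:
;
;   #include <immintrin.h>
;   __m512i bcast_d(__m128i a)   { return _mm512_broadcastd_epi32(a); }
;   __m512i bcast_d_merge(__m512i s, __mmask16 k, __m128i a) {
;     return _mm512_mask_broadcastd_epi32(s, k, a);
;   }
;   __m512d bcast_sd(__m128d a)  { return _mm512_broadcastsd_pd(a); }
;   __m512  bcast_ss_zero(__mmask16 k, __m128 a) {
;     return _mm512_maskz_broadcastss_ps(k, a);
;   }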
    596 define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
    597 ; CHECK-LABEL: test_mm512_broadcastd_epi32:
    598 ; CHECK:       # %bb.0:
    599 ; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
    600 ; CHECK-NEXT:    ret{{[l|q]}}
    601   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
    602   %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
    603   %res1 = bitcast <16 x i32> %res0 to <8 x i64>
    604   ret <8 x i64> %res1
    605 }
    606 
    607 define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
    608 ; X86-LABEL: test_mm512_mask_broadcastd_epi32:
    609 ; X86:       # %bb.0:
    610 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    611 ; X86-NEXT:    vpbroadcastd %xmm1, %zmm0 {%k1}
    612 ; X86-NEXT:    retl
    613 ;
    614 ; X64-LABEL: test_mm512_mask_broadcastd_epi32:
    615 ; X64:       # %bb.0:
    616 ; X64-NEXT:    kmovw %edi, %k1
    617 ; X64-NEXT:    vpbroadcastd %xmm1, %zmm0 {%k1}
    618 ; X64-NEXT:    retq
    619   %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
    620   %arg1 = bitcast i16 %a1 to <16 x i1>
    621   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
    622   %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
    623   %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
    624   %res2 = bitcast <16 x i32> %res1 to <8 x i64>
    625   ret <8 x i64> %res2
    626 }
    627 
    628 define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
    629 ; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
    630 ; X86:       # %bb.0:
    631 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    632 ; X86-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
    633 ; X86-NEXT:    retl
    634 ;
    635 ; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
    636 ; X64:       # %bb.0:
    637 ; X64-NEXT:    kmovw %edi, %k1
    638 ; X64-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
    639 ; X64-NEXT:    retq
    640   %arg0 = bitcast i16 %a0 to <16 x i1>
    641   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
    642   %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
    643   %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
    644   %res2 = bitcast <16 x i32> %res1 to <8 x i64>
    645   ret <8 x i64> %res2
    646 }
    647 
    648 define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
    649 ; CHECK-LABEL: test_mm512_broadcastq_epi64:
    650 ; CHECK:       # %bb.0:
    651 ; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
    652 ; CHECK-NEXT:    ret{{[l|q]}}
    653   %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
    654   ret <8 x i64> %res
    655 }
    656 
    657 define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
    658 ; X86-LABEL: test_mm512_mask_broadcastq_epi64:
    659 ; X86:       # %bb.0:
    660 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    661 ; X86-NEXT:    kmovw %eax, %k1
    662 ; X86-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
    663 ; X86-NEXT:    retl
    664 ;
    665 ; X64-LABEL: test_mm512_mask_broadcastq_epi64:
    666 ; X64:       # %bb.0:
    667 ; X64-NEXT:    kmovw %edi, %k1
    668 ; X64-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
    669 ; X64-NEXT:    retq
    670   %arg1 = bitcast i8 %a1 to <8 x i1>
    671   %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
    672   %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
    673   ret <8 x i64> %res1
    674 }
    675 
    676 define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
    677 ; X86-LABEL: test_mm512_maskz_broadcastq_epi64:
    678 ; X86:       # %bb.0:
    679 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    680 ; X86-NEXT:    kmovw %eax, %k1
    681 ; X86-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
    682 ; X86-NEXT:    retl
    683 ;
    684 ; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
    685 ; X64:       # %bb.0:
    686 ; X64-NEXT:    kmovw %edi, %k1
    687 ; X64-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
    688 ; X64-NEXT:    retq
    689   %arg0 = bitcast i8 %a0 to <8 x i1>
    690   %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
    691   %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
    692   ret <8 x i64> %res1
    693 }
    694 
    695 define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
    696 ; CHECK-LABEL: test_mm512_broadcastsd_pd:
    697 ; CHECK:       # %bb.0:
    698 ; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
    699 ; CHECK-NEXT:    ret{{[l|q]}}
    700   %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
    701   ret <8 x double> %res
    702 }
    703 
    704 define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
    705 ; X86-LABEL: test_mm512_mask_broadcastsd_pd:
    706 ; X86:       # %bb.0:
    707 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    708 ; X86-NEXT:    kmovw %eax, %k1
    709 ; X86-NEXT:    vbroadcastsd %xmm1, %zmm0 {%k1}
    710 ; X86-NEXT:    retl
    711 ;
    712 ; X64-LABEL: test_mm512_mask_broadcastsd_pd:
    713 ; X64:       # %bb.0:
    714 ; X64-NEXT:    kmovw %edi, %k1
    715 ; X64-NEXT:    vbroadcastsd %xmm1, %zmm0 {%k1}
    716 ; X64-NEXT:    retq
    717   %arg1 = bitcast i8 %a1 to <8 x i1>
    718   %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
    719   %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
    720   ret <8 x double> %res1
    721 }
    722 
    723 define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
    724 ; X86-LABEL: test_mm512_maskz_broadcastsd_pd:
    725 ; X86:       # %bb.0:
    726 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    727 ; X86-NEXT:    kmovw %eax, %k1
    728 ; X86-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
    729 ; X86-NEXT:    retl
    730 ;
    731 ; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
    732 ; X64:       # %bb.0:
    733 ; X64-NEXT:    kmovw %edi, %k1
    734 ; X64-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
    735 ; X64-NEXT:    retq
    736   %arg0 = bitcast i8 %a0 to <8 x i1>
    737   %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
    738   %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
    739   ret <8 x double> %res1
    740 }
    741 
    742 define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
    743 ; CHECK-LABEL: test_mm512_broadcastss_ps:
    744 ; CHECK:       # %bb.0:
    745 ; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
    746 ; CHECK-NEXT:    ret{{[l|q]}}
    747   %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
    748   ret <16 x float> %res
    749 }
    750 
    751 define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
    752 ; X86-LABEL: test_mm512_mask_broadcastss_ps:
    753 ; X86:       # %bb.0:
    754 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    755 ; X86-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
    756 ; X86-NEXT:    retl
    757 ;
    758 ; X64-LABEL: test_mm512_mask_broadcastss_ps:
    759 ; X64:       # %bb.0:
    760 ; X64-NEXT:    kmovw %edi, %k1
    761 ; X64-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
    762 ; X64-NEXT:    retq
    763   %arg1 = bitcast i16 %a1 to <16 x i1>
    764   %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
    765   %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
    766   ret <16 x float> %res1
    767 }
    768 
    769 define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
    770 ; X86-LABEL: test_mm512_maskz_broadcastss_ps:
    771 ; X86:       # %bb.0:
    772 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    773 ; X86-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
    774 ; X86-NEXT:    retl
    775 ;
    776 ; X64-LABEL: test_mm512_maskz_broadcastss_ps:
    777 ; X64:       # %bb.0:
    778 ; X64-NEXT:    kmovw %edi, %k1
    779 ; X64-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
    780 ; X64-NEXT:    retq
    781   %arg0 = bitcast i16 %a0 to <16 x i1>
    782   %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
    783   %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
    784   ret <16 x float> %res1
    785 }
    786 
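; The movedup/movehdup/moveldup tests model the in-register element-duplication
; intrinsics: duplicate the even doubles, the odd floats, and the even floats
; respectively. A hedged sketch, wrapper names invented:
;
;   #include <immintrin.h>
;   __m512d dup_even_pd(__m512d a) { return _mm512_movedup_pd(a);  /* 0,0,2,2,... */ }
;   __m512  dup_odd_ps(__m512 a)   { return _mm512_movehdup_ps(a); /* 1,1,3,3,... */ }
;   __m512  dup_even_ps(__m512 a)  { return _mm512_moveldup_ps(a); /* 0,0,2,2,... */ }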
    787 define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) {
    788 ; CHECK-LABEL: test_mm512_movedup_pd:
    789 ; CHECK:       # %bb.0:
    790 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
    791 ; CHECK-NEXT:    ret{{[l|q]}}
    792   %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    793   ret <8 x double> %res
    794 }
    795 
    796 define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
    797 ; X86-LABEL: test_mm512_mask_movedup_pd:
    798 ; X86:       # %bb.0:
    799 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    800 ; X86-NEXT:    kmovw %eax, %k1
    801 ; X86-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
    802 ; X86-NEXT:    retl
    803 ;
    804 ; X64-LABEL: test_mm512_mask_movedup_pd:
    805 ; X64:       # %bb.0:
    806 ; X64-NEXT:    kmovw %edi, %k1
    807 ; X64-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
    808 ; X64-NEXT:    retq
    809   %arg1 = bitcast i8 %a1 to <8 x i1>
    810   %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    811   %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
    812   ret <8 x double> %res1
    813 }
    814 
    815 define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) {
    816 ; X86-LABEL: test_mm512_maskz_movedup_pd:
    817 ; X86:       # %bb.0:
    818 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    819 ; X86-NEXT:    kmovw %eax, %k1
    820 ; X86-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
    821 ; X86-NEXT:    retl
    822 ;
    823 ; X64-LABEL: test_mm512_maskz_movedup_pd:
    824 ; X64:       # %bb.0:
    825 ; X64-NEXT:    kmovw %edi, %k1
    826 ; X64-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
    827 ; X64-NEXT:    retq
    828   %arg0 = bitcast i8 %a0 to <8 x i1>
    829   %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    830   %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
    831   ret <8 x double> %res1
    832 }
    833 
    834 define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
    835 ; CHECK-LABEL: test_mm512_movehdup_ps:
    836 ; CHECK:       # %bb.0:
    837 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
    838 ; CHECK-NEXT:    ret{{[l|q]}}
    839   %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
    840   ret <16 x float> %res
    841 }
    842 
    843 define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
    844 ; X86-LABEL: test_mm512_mask_movehdup_ps:
    845 ; X86:       # %bb.0:
    846 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    847 ; X86-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
    848 ; X86-NEXT:    retl
    849 ;
    850 ; X64-LABEL: test_mm512_mask_movehdup_ps:
    851 ; X64:       # %bb.0:
    852 ; X64-NEXT:    kmovw %edi, %k1
    853 ; X64-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
    854 ; X64-NEXT:    retq
    855   %arg1 = bitcast i16 %a1 to <16 x i1>
    856   %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
    857   %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
    858   ret <16 x float> %res1
    859 }
    860 
    861 define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
    862 ; X86-LABEL: test_mm512_maskz_movehdup_ps:
    863 ; X86:       # %bb.0:
    864 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    865 ; X86-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
    866 ; X86-NEXT:    retl
    867 ;
    868 ; X64-LABEL: test_mm512_maskz_movehdup_ps:
    869 ; X64:       # %bb.0:
    870 ; X64-NEXT:    kmovw %edi, %k1
    871 ; X64-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
    872 ; X64-NEXT:    retq
    873   %arg0 = bitcast i16 %a0 to <16 x i1>
    874   %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
    875   %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
    876   ret <16 x float> %res1
    877 }
    878 
    879 define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
    880 ; CHECK-LABEL: test_mm512_moveldup_ps:
    881 ; CHECK:       # %bb.0:
    882 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
    883 ; CHECK-NEXT:    ret{{[l|q]}}
    884   %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
    885   ret <16 x float> %res
    886 }
    887 
    888 define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
    889 ; X86-LABEL: test_mm512_mask_moveldup_ps:
    890 ; X86:       # %bb.0:
    891 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    892 ; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
    893 ; X86-NEXT:    retl
    894 ;
    895 ; X64-LABEL: test_mm512_mask_moveldup_ps:
    896 ; X64:       # %bb.0:
    897 ; X64-NEXT:    kmovw %edi, %k1
    898 ; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
    899 ; X64-NEXT:    retq
    900   %arg1 = bitcast i16 %a1 to <16 x i1>
    901   %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
    902   %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
    903   ret <16 x float> %res1
    904 }
    905 
    906 define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
    907 ; X86-LABEL: test_mm512_maskz_moveldup_ps:
    908 ; X86:       # %bb.0:
    909 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    910 ; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
    911 ; X86-NEXT:    retl
    912 ;
    913 ; X64-LABEL: test_mm512_maskz_moveldup_ps:
    914 ; X64:       # %bb.0:
    915 ; X64-NEXT:    kmovw %edi, %k1
    916 ; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
    917 ; X64-NEXT:    retq
    918   %arg0 = bitcast i16 %a0 to <16 x i1>
    919   %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
    920   %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
    921   ret <16 x float> %res1
    922 }
    923 
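; The permute tests model the immediate-controlled in-lane permutes
; (vpermilpd/vpermilps). A hedged sketch; the wrapper names are invented and
; the immediates are chosen to reproduce the element patterns in the IR below:
;
;   #include <immintrin.h>
;   __m512d perm_pd(__m512d a) { return _mm512_permute_pd(a, 2); } // [0,1],[0,0],...
;   __m512  perm_ps(__m512 a)  { return _mm512_permute_ps(a, 2); } // [2,0,0,0] per lane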
    924 define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
    925 ; CHECK-LABEL: test_mm512_permute_pd:
    926 ; CHECK:       # %bb.0:
    927 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
    928 ; CHECK-NEXT:    ret{{[l|q]}}
    929   %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    930   ret <8 x double> %res
    931 }
    932 
    933 define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
    934 ; X86-LABEL: test_mm512_mask_permute_pd:
    935 ; X86:       # %bb.0:
    936 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    937 ; X86-NEXT:    kmovw %eax, %k1
    938 ; X86-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
    939 ; X86-NEXT:    retl
    940 ;
    941 ; X64-LABEL: test_mm512_mask_permute_pd:
    942 ; X64:       # %bb.0:
    943 ; X64-NEXT:    kmovw %edi, %k1
    944 ; X64-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
    945 ; X64-NEXT:    retq
    946   %arg1 = bitcast i8 %a1 to <8 x i1>
    947   %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    948   %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
    949   ret <8 x double> %res1
    950 }
    951 
    952 define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
    953 ; X86-LABEL: test_mm512_maskz_permute_pd:
    954 ; X86:       # %bb.0:
    955 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
    956 ; X86-NEXT:    kmovw %eax, %k1
    957 ; X86-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
    958 ; X86-NEXT:    retl
    959 ;
    960 ; X64-LABEL: test_mm512_maskz_permute_pd:
    961 ; X64:       # %bb.0:
    962 ; X64-NEXT:    kmovw %edi, %k1
    963 ; X64-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
    964 ; X64-NEXT:    retq
    965   %arg0 = bitcast i8 %a0 to <8 x i1>
    966   %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
    967   %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
    968   ret <8 x double> %res1
    969 }
    970 
    971 define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
    972 ; CHECK-LABEL: test_mm512_permute_ps:
    973 ; CHECK:       # %bb.0:
    974 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
    975 ; CHECK-NEXT:    ret{{[l|q]}}
    976   %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
    977   ret <16 x float> %res
    978 }
    979 
    980 define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
    981 ; X86-LABEL: test_mm512_mask_permute_ps:
    982 ; X86:       # %bb.0:
    983 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
    984 ; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
    985 ; X86-NEXT:    retl
    986 ;
    987 ; X64-LABEL: test_mm512_mask_permute_ps:
    988 ; X64:       # %bb.0:
    989 ; X64-NEXT:    kmovw %edi, %k1
    990 ; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
    991 ; X64-NEXT:    retq
    992   %arg1 = bitcast i16 %a1 to <16 x i1>
    993   %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
    994   %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
    995   ret <16 x float> %res1
    996 }
    997 
    998 define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
    999 ; X86-LABEL: test_mm512_maskz_permute_ps:
   1000 ; X86:       # %bb.0:
   1001 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   1002 ; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
   1003 ; X86-NEXT:    retl
   1004 ;
   1005 ; X64-LABEL: test_mm512_maskz_permute_ps:
   1006 ; X64:       # %bb.0:
   1007 ; X64-NEXT:    kmovw %edi, %k1
   1008 ; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
   1009 ; X64-NEXT:    retq
   1010   %arg0 = bitcast i16 %a0 to <16 x i1>
   1011   %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
   1012   %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
   1013   ret <16 x float> %res1
   1014 }
   1015 
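; The permutex tests model the immediate-controlled 64-bit permute within each
; 256-bit half (vpermq/vpermpd). A hedged sketch, wrapper names invented; an
; immediate of 0 gives the [0,0,0,0] per-half pattern in the IR below:
;
;   #include <immintrin.h>
;   __m512i permx_q(__m512i a)  { return _mm512_permutex_epi64(a, 0); }
;   __m512d permx_pd(__m512d a) { return _mm512_permutex_pd(a, 0); }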
   1016 define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
   1017 ; CHECK-LABEL: test_mm512_permutex_epi64:
   1018 ; CHECK:       # %bb.0:
   1019 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
   1020 ; CHECK-NEXT:    ret{{[l|q]}}
   1021   %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   1022   ret <8 x i64> %res
   1023 }
   1024 
   1025 define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
   1026 ; X86-LABEL: test_mm512_mask_permutex_epi64:
   1027 ; X86:       # %bb.0:
   1028 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1029 ; X86-NEXT:    kmovw %eax, %k1
   1030 ; X86-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
   1031 ; X86-NEXT:    retl
   1032 ;
   1033 ; X64-LABEL: test_mm512_mask_permutex_epi64:
   1034 ; X64:       # %bb.0:
   1035 ; X64-NEXT:    kmovw %edi, %k1
   1036 ; X64-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
   1037 ; X64-NEXT:    retq
   1038   %arg1 = bitcast i8 %a1 to <8 x i1>
   1039   %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   1040   %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
   1041   ret <8 x i64> %res1
   1042 }
   1043 
   1044 define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
   1045 ; X86-LABEL: test_mm512_maskz_permutex_epi64:
   1046 ; X86:       # %bb.0:
   1047 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1048 ; X86-NEXT:    kmovw %eax, %k1
   1049 ; X86-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
   1050 ; X86-NEXT:    retl
   1051 ;
   1052 ; X64-LABEL: test_mm512_maskz_permutex_epi64:
   1053 ; X64:       # %bb.0:
   1054 ; X64-NEXT:    kmovw %edi, %k1
   1055 ; X64-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
   1056 ; X64-NEXT:    retq
   1057   %arg0 = bitcast i8 %a0 to <8 x i1>
   1058   %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   1059   %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
   1060   ret <8 x i64> %res1
   1061 }
   1062 
   1063 define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
   1064 ; CHECK-LABEL: test_mm512_permutex_pd:
   1065 ; CHECK:       # %bb.0:
   1066 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
   1067 ; CHECK-NEXT:    ret{{[l|q]}}
   1068   %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   1069   ret <8 x double> %res
   1070 }
   1071 
   1072 define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
   1073 ; X86-LABEL: test_mm512_mask_permutex_pd:
   1074 ; X86:       # %bb.0:
   1075 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1076 ; X86-NEXT:    kmovw %eax, %k1
   1077 ; X86-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
   1078 ; X86-NEXT:    retl
   1079 ;
   1080 ; X64-LABEL: test_mm512_mask_permutex_pd:
   1081 ; X64:       # %bb.0:
   1082 ; X64-NEXT:    kmovw %edi, %k1
   1083 ; X64-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
   1084 ; X64-NEXT:    retq
   1085   %arg1 = bitcast i8 %a1 to <8 x i1>
   1086   %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   1087   %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
   1088   ret <8 x double> %res1
   1089 }
   1090 
   1091 define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
   1092 ; X86-LABEL: test_mm512_maskz_permutex_pd:
   1093 ; X86:       # %bb.0:
   1094 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1095 ; X86-NEXT:    kmovw %eax, %k1
   1096 ; X86-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
   1097 ; X86-NEXT:    retl
   1098 ;
   1099 ; X64-LABEL: test_mm512_maskz_permutex_pd:
   1100 ; X64:       # %bb.0:
   1101 ; X64-NEXT:    kmovw %edi, %k1
   1102 ; X64-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
   1103 ; X64-NEXT:    retq
   1104   %arg0 = bitcast i8 %a0 to <8 x i1>
   1105   %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   1106   %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
   1107   ret <8 x double> %res1
   1108 }
   1109 
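; The shuffle_epi32 tests model the per-128-bit-lane dword shuffle (vpshufd).
; A hedged sketch with an invented wrapper name; _MM_PERM_AAAB (imm8 = 1) gives
; the [1,0,0,0] per-lane pattern in the IR below:
;
;   #include <immintrin.h>
;   __m512i shuf_dwords(__m512i a) {
;     return _mm512_shuffle_epi32(a, _MM_PERM_AAAB);
;   }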
   1110 define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
   1111 ; CHECK-LABEL: test_mm512_shuffle_epi32:
   1112 ; CHECK:       # %bb.0:
   1113 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
   1114 ; CHECK-NEXT:    ret{{[l|q]}}
   1115   %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
   1116   %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
   1117   %res1 = bitcast <16 x i32> %res0 to <8 x i64>
   1118   ret <8 x i64> %res1
   1119 }
   1120 
   1121 define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
   1122 ; X86-LABEL: test_mm512_mask_shuffle_epi32:
   1123 ; X86:       # %bb.0:
   1124 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   1125 ; X86-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
   1126 ; X86-NEXT:    retl
   1127 ;
   1128 ; X64-LABEL: test_mm512_mask_shuffle_epi32:
   1129 ; X64:       # %bb.0:
   1130 ; X64-NEXT:    kmovw %edi, %k1
   1131 ; X64-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
   1132 ; X64-NEXT:    retq
   1133   %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
   1134   %arg1 = bitcast i16 %a1 to <16 x i1>
   1135   %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
   1136   %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
   1137   %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
   1138   %res2 = bitcast <16 x i32> %res1 to <8 x i64>
   1139   ret <8 x i64> %res2
   1140 }
   1141 
   1142 define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
   1143 ; X86-LABEL: test_mm512_maskz_shuffle_epi32:
   1144 ; X86:       # %bb.0:
   1145 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   1146 ; X86-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
   1147 ; X86-NEXT:    retl
   1148 ;
   1149 ; X64-LABEL: test_mm512_maskz_shuffle_epi32:
   1150 ; X64:       # %bb.0:
   1151 ; X64-NEXT:    kmovw %edi, %k1
   1152 ; X64-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
   1153 ; X64-NEXT:    retq
   1154   %arg0 = bitcast i16 %a0 to <16 x i1>
   1155   %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
   1156   %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
   1157   %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
   1158   %res2 = bitcast <16 x i32> %res1 to <8 x i64>
   1159   ret <8 x i64> %res2
   1160 }
   1161 
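; The shuffle_pd tests model the per-lane two-source double shuffle (vshufpd).
; A hedged sketch, wrapper name invented; an immediate of 4 reproduces the
; element selection in the IR below:
;
;   #include <immintrin.h>
;   __m512d shuf_pd(__m512d a, __m512d b) {
;     return _mm512_shuffle_pd(a, b, 4);
;   }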
   1162 define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
   1163 ; CHECK-LABEL: test_mm512_shuffle_pd:
   1164 ; CHECK:       # %bb.0:
   1165 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
   1166 ; CHECK-NEXT:    ret{{[l|q]}}
   1167   %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
   1168   ret <8 x double> %res
   1169 }
   1170 
   1171 define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
   1172 ; X86-LABEL: test_mm512_mask_shuffle_pd:
   1173 ; X86:       # %bb.0:
   1174 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1175 ; X86-NEXT:    kmovw %eax, %k1
   1176 ; X86-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
   1177 ; X86-NEXT:    retl
   1178 ;
   1179 ; X64-LABEL: test_mm512_mask_shuffle_pd:
   1180 ; X64:       # %bb.0:
   1181 ; X64-NEXT:    kmovw %edi, %k1
   1182 ; X64-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
   1183 ; X64-NEXT:    retq
   1184   %arg1 = bitcast i8 %a1 to <8 x i1>
   1185   %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
   1186   %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
   1187   ret <8 x double> %res1
   1188 }
   1189 
   1190 define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
   1191 ; X86-LABEL: test_mm512_maskz_shuffle_pd:
   1192 ; X86:       # %bb.0:
   1193 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1194 ; X86-NEXT:    kmovw %eax, %k1
   1195 ; X86-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
   1196 ; X86-NEXT:    retl
   1197 ;
   1198 ; X64-LABEL: test_mm512_maskz_shuffle_pd:
   1199 ; X64:       # %bb.0:
   1200 ; X64-NEXT:    kmovw %edi, %k1
   1201 ; X64-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
   1202 ; X64-NEXT:    retq
   1203   %arg0 = bitcast i8 %a0 to <8 x i1>
   1204   %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
   1205   %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
   1206   ret <8 x double> %res1
   1207 }
   1208 
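; unpackhi tests (epi32, epi64, pd, ps): interleave the high elements of each 128-bit lane; the epi32 forms bitcast the <8 x i64> arguments to <16 x i32> first, and the masked variants select against the passthru or zero vector.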
   1209 define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
   1210 ; CHECK-LABEL: test_mm512_unpackhi_epi32:
   1211 ; CHECK:       # %bb.0:
   1212 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
   1213 ; CHECK-NEXT:    ret{{[l|q]}}
   1214   %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
   1215   %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
   1216   %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   1217   %res1 = bitcast <16 x i32> %res0 to <8 x i64>
   1218   ret <8 x i64> %res1
   1219 }
   1220 
   1221 define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
   1222 ; X86-LABEL: test_mm512_mask_unpackhi_epi32:
   1223 ; X86:       # %bb.0:
   1224 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   1225 ; X86-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
   1226 ; X86-NEXT:    retl
   1227 ;
   1228 ; X64-LABEL: test_mm512_mask_unpackhi_epi32:
   1229 ; X64:       # %bb.0:
   1230 ; X64-NEXT:    kmovw %edi, %k1
   1231 ; X64-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
   1232 ; X64-NEXT:    retq
   1233   %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
   1234   %arg1 = bitcast i16 %a1 to <16 x i1>
   1235   %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
   1236   %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
   1237   %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   1238   %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
   1239   %res2 = bitcast <16 x i32> %res1 to <8 x i64>
   1240   ret <8 x i64> %res2
   1241 }
   1242 
   1243 define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
   1244 ; X86-LABEL: test_mm512_maskz_unpackhi_epi32:
   1245 ; X86:       # %bb.0:
   1246 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   1247 ; X86-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
   1248 ; X86-NEXT:    retl
   1249 ;
   1250 ; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
   1251 ; X64:       # %bb.0:
   1252 ; X64-NEXT:    kmovw %edi, %k1
   1253 ; X64-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
   1254 ; X64-NEXT:    retq
   1255   %arg0 = bitcast i16 %a0 to <16 x i1>
   1256   %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
   1257   %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
   1258   %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   1259   %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
   1260   %res2 = bitcast <16 x i32> %res1 to <8 x i64>
   1261   ret <8 x i64> %res2
   1262 }
   1263 
   1264 define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
   1265 ; CHECK-LABEL: test_mm512_unpackhi_epi64:
   1266 ; CHECK:       # %bb.0:
   1267 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
   1268 ; CHECK-NEXT:    ret{{[l|q]}}
   1269   %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   1270   ret <8 x i64> %res
   1271 }
   1272 
   1273 define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
   1274 ; X86-LABEL: test_mm512_mask_unpackhi_epi64:
   1275 ; X86:       # %bb.0:
   1276 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1277 ; X86-NEXT:    kmovw %eax, %k1
   1278 ; X86-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
   1279 ; X86-NEXT:    retl
   1280 ;
   1281 ; X64-LABEL: test_mm512_mask_unpackhi_epi64:
   1282 ; X64:       # %bb.0:
   1283 ; X64-NEXT:    kmovw %edi, %k1
   1284 ; X64-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
   1285 ; X64-NEXT:    retq
   1286   %arg1 = bitcast i8 %a1 to <8 x i1>
   1287   %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   1288   %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
   1289   ret <8 x i64> %res1
   1290 }
   1291 
   1292 define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
   1293 ; X86-LABEL: test_mm512_maskz_unpackhi_epi64:
   1294 ; X86:       # %bb.0:
   1295 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1296 ; X86-NEXT:    kmovw %eax, %k1
   1297 ; X86-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
   1298 ; X86-NEXT:    retl
   1299 ;
   1300 ; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
   1301 ; X64:       # %bb.0:
   1302 ; X64-NEXT:    kmovw %edi, %k1
   1303 ; X64-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
   1304 ; X64-NEXT:    retq
   1305   %arg0 = bitcast i8 %a0 to <8 x i1>
   1306   %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   1307   %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
   1308   ret <8 x i64> %res1
   1309 }
   1310 
   1311 define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
   1312 ; CHECK-LABEL: test_mm512_unpackhi_pd:
   1313 ; CHECK:       # %bb.0:
   1314 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
   1315 ; CHECK-NEXT:    ret{{[l|q]}}
   1316   %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   1317   ret <8 x double> %res
   1318 }
   1319 
   1320 define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
   1321 ; X86-LABEL: test_mm512_mask_unpackhi_pd:
   1322 ; X86:       # %bb.0:
   1323 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1324 ; X86-NEXT:    kmovw %eax, %k1
   1325 ; X86-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
   1326 ; X86-NEXT:    retl
   1327 ;
   1328 ; X64-LABEL: test_mm512_mask_unpackhi_pd:
   1329 ; X64:       # %bb.0:
   1330 ; X64-NEXT:    kmovw %edi, %k1
   1331 ; X64-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
   1332 ; X64-NEXT:    retq
   1333   %arg1 = bitcast i8 %a1 to <8 x i1>
   1334   %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   1335   %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
   1336   ret <8 x double> %res1
   1337 }
   1338 
   1339 define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
   1340 ; X86-LABEL: test_mm512_maskz_unpackhi_pd:
   1341 ; X86:       # %bb.0:
   1342 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1343 ; X86-NEXT:    kmovw %eax, %k1
   1344 ; X86-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
   1345 ; X86-NEXT:    retl
   1346 ;
   1347 ; X64-LABEL: test_mm512_maskz_unpackhi_pd:
   1348 ; X64:       # %bb.0:
   1349 ; X64-NEXT:    kmovw %edi, %k1
   1350 ; X64-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
   1351 ; X64-NEXT:    retq
   1352   %arg0 = bitcast i8 %a0 to <8 x i1>
   1353   %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   1354   %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
   1355   ret <8 x double> %res1
   1356 }
   1357 
   1358 define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
   1359 ; CHECK-LABEL: test_mm512_unpackhi_ps:
   1360 ; CHECK:       # %bb.0:
   1361 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
   1362 ; CHECK-NEXT:    ret{{[l|q]}}
   1363   %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   1364   ret <16 x float> %res
   1365 }
   1366 
   1367 define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
   1368 ; X86-LABEL: test_mm512_mask_unpackhi_ps:
   1369 ; X86:       # %bb.0:
   1370 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   1371 ; X86-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
   1372 ; X86-NEXT:    retl
   1373 ;
   1374 ; X64-LABEL: test_mm512_mask_unpackhi_ps:
   1375 ; X64:       # %bb.0:
   1376 ; X64-NEXT:    kmovw %edi, %k1
   1377 ; X64-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
   1378 ; X64-NEXT:    retq
   1379   %arg1 = bitcast i16 %a1 to <16 x i1>
   1380   %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   1381   %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
   1382   ret <16 x float> %res1
   1383 }
   1384 
   1385 define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
   1386 ; X86-LABEL: test_mm512_maskz_unpackhi_ps:
   1387 ; X86:       # %bb.0:
   1388 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   1389 ; X86-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
   1390 ; X86-NEXT:    retl
   1391 ;
   1392 ; X64-LABEL: test_mm512_maskz_unpackhi_ps:
   1393 ; X64:       # %bb.0:
   1394 ; X64-NEXT:    kmovw %edi, %k1
   1395 ; X64-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
   1396 ; X64-NEXT:    retq
   1397   %arg0 = bitcast i16 %a0 to <16 x i1>
   1398   %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   1399   %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
   1400   ret <16 x float> %res1
   1401 }
   1402 
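; unpacklo tests mirror the unpackhi block above but interleave the low elements of each 128-bit lane.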
   1403 define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
   1404 ; CHECK-LABEL: test_mm512_unpacklo_epi32:
   1405 ; CHECK:       # %bb.0:
   1406 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
   1407 ; CHECK-NEXT:    ret{{[l|q]}}
   1408   %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
   1409   %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
   1410   %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   1411   %res1 = bitcast <16 x i32> %res0 to <8 x i64>
   1412   ret <8 x i64> %res1
   1413 }
   1414 
   1415 define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
   1416 ; X86-LABEL: test_mm512_mask_unpacklo_epi32:
   1417 ; X86:       # %bb.0:
   1418 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   1419 ; X86-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
   1420 ; X86-NEXT:    retl
   1421 ;
   1422 ; X64-LABEL: test_mm512_mask_unpacklo_epi32:
   1423 ; X64:       # %bb.0:
   1424 ; X64-NEXT:    kmovw %edi, %k1
   1425 ; X64-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
   1426 ; X64-NEXT:    retq
   1427   %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
   1428   %arg1 = bitcast i16 %a1 to <16 x i1>
   1429   %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
   1430   %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
   1431   %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   1432   %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
   1433   %res2 = bitcast <16 x i32> %res1 to <8 x i64>
   1434   ret <8 x i64> %res2
   1435 }
   1436 
   1437 define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
   1438 ; X86-LABEL: test_mm512_maskz_unpacklo_epi32:
   1439 ; X86:       # %bb.0:
   1440 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   1441 ; X86-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
   1442 ; X86-NEXT:    retl
   1443 ;
   1444 ; X64-LABEL: test_mm512_maskz_unpacklo_epi32:
   1445 ; X64:       # %bb.0:
   1446 ; X64-NEXT:    kmovw %edi, %k1
   1447 ; X64-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
   1448 ; X64-NEXT:    retq
   1449   %arg0 = bitcast i16 %a0 to <16 x i1>
   1450   %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
   1451   %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
   1452   %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   1453   %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
   1454   %res2 = bitcast <16 x i32> %res1 to <8 x i64>
   1455   ret <8 x i64> %res2
   1456 }
   1457 
   1458 define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
   1459 ; CHECK-LABEL: test_mm512_unpacklo_epi64:
   1460 ; CHECK:       # %bb.0:
   1461 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
   1462 ; CHECK-NEXT:    ret{{[l|q]}}
   1463   %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   1464   ret <8 x i64> %res
   1465 }
   1466 
   1467 define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
   1468 ; X86-LABEL: test_mm512_mask_unpacklo_epi64:
   1469 ; X86:       # %bb.0:
   1470 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1471 ; X86-NEXT:    kmovw %eax, %k1
   1472 ; X86-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
   1473 ; X86-NEXT:    retl
   1474 ;
   1475 ; X64-LABEL: test_mm512_mask_unpacklo_epi64:
   1476 ; X64:       # %bb.0:
   1477 ; X64-NEXT:    kmovw %edi, %k1
   1478 ; X64-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
   1479 ; X64-NEXT:    retq
   1480   %arg1 = bitcast i8 %a1 to <8 x i1>
   1481   %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   1482   %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
   1483   ret <8 x i64> %res1
   1484 }
   1485 
   1486 define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
   1487 ; X86-LABEL: test_mm512_maskz_unpacklo_epi64:
   1488 ; X86:       # %bb.0:
   1489 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1490 ; X86-NEXT:    kmovw %eax, %k1
   1491 ; X86-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
   1492 ; X86-NEXT:    retl
   1493 ;
   1494 ; X64-LABEL: test_mm512_maskz_unpacklo_epi64:
   1495 ; X64:       # %bb.0:
   1496 ; X64-NEXT:    kmovw %edi, %k1
   1497 ; X64-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
   1498 ; X64-NEXT:    retq
   1499   %arg0 = bitcast i8 %a0 to <8 x i1>
   1500   %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   1501   %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
   1502   ret <8 x i64> %res1
   1503 }
   1504 
   1505 define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
   1506 ; CHECK-LABEL: test_mm512_unpacklo_pd:
   1507 ; CHECK:       # %bb.0:
   1508 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
   1509 ; CHECK-NEXT:    ret{{[l|q]}}
   1510   %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   1511   ret <8 x double> %res
   1512 }
   1513 
   1514 define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
   1515 ; X86-LABEL: test_mm512_mask_unpacklo_pd:
   1516 ; X86:       # %bb.0:
   1517 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1518 ; X86-NEXT:    kmovw %eax, %k1
   1519 ; X86-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
   1520 ; X86-NEXT:    retl
   1521 ;
   1522 ; X64-LABEL: test_mm512_mask_unpacklo_pd:
   1523 ; X64:       # %bb.0:
   1524 ; X64-NEXT:    kmovw %edi, %k1
   1525 ; X64-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
   1526 ; X64-NEXT:    retq
   1527   %arg1 = bitcast i8 %a1 to <8 x i1>
   1528   %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   1529   %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
   1530   ret <8 x double> %res1
   1531 }
   1532 
   1533 define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
   1534 ; X86-LABEL: test_mm512_maskz_unpacklo_pd:
   1535 ; X86:       # %bb.0:
   1536 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1537 ; X86-NEXT:    kmovw %eax, %k1
   1538 ; X86-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
   1539 ; X86-NEXT:    retl
   1540 ;
   1541 ; X64-LABEL: test_mm512_maskz_unpacklo_pd:
   1542 ; X64:       # %bb.0:
   1543 ; X64-NEXT:    kmovw %edi, %k1
   1544 ; X64-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
   1545 ; X64-NEXT:    retq
   1546   %arg0 = bitcast i8 %a0 to <8 x i1>
   1547   %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   1548   %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
   1549   ret <8 x double> %res1
   1550 }
   1551 
   1552 define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
   1553 ; CHECK-LABEL: test_mm512_unpacklo_ps:
   1554 ; CHECK:       # %bb.0:
   1555 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
   1556 ; CHECK-NEXT:    ret{{[l|q]}}
   1557   %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   1558   ret <16 x float> %res
   1559 }
   1560 
   1561 define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
   1562 ; X86-LABEL: test_mm512_mask_unpacklo_ps:
   1563 ; X86:       # %bb.0:
   1564 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   1565 ; X86-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
   1566 ; X86-NEXT:    retl
   1567 ;
   1568 ; X64-LABEL: test_mm512_mask_unpacklo_ps:
   1569 ; X64:       # %bb.0:
   1570 ; X64-NEXT:    kmovw %edi, %k1
   1571 ; X64-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
   1572 ; X64-NEXT:    retq
   1573   %arg1 = bitcast i16 %a1 to <16 x i1>
   1574   %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   1575   %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
   1576   ret <16 x float> %res1
   1577 }
   1578 
   1579 define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
   1580 ; X86-LABEL: test_mm512_maskz_unpacklo_ps:
   1581 ; X86:       # %bb.0:
   1582 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   1583 ; X86-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
   1584 ; X86-NEXT:    retl
   1585 ;
   1586 ; X64-LABEL: test_mm512_maskz_unpacklo_ps:
   1587 ; X64:       # %bb.0:
   1588 ; X64-NEXT:    kmovw %edi, %k1
   1589 ; X64-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
   1590 ; X64-NEXT:    retq
   1591   %arg0 = bitcast i16 %a0 to <16 x i1>
   1592   %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   1593   %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
   1594   ret <16 x float> %res1
   1595 }
   1596 
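; zextpd/zextps/zextsi tests: widen a 128-bit or 256-bit vector to 512 bits with the upper elements taken from zeroinitializer; the expected output is a bare vmovaps of the source register, which implicitly zeroes the upper bits.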
   1597 define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind {
   1598 ; CHECK-LABEL: test_mm512_zextpd128_pd512:
   1599 ; CHECK:       # %bb.0:
   1600 ; CHECK-NEXT:    vmovaps %xmm0, %xmm0
   1601 ; CHECK-NEXT:    ret{{[l|q]}}
   1602   %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
   1603   ret <8 x double> %res
   1604 }
   1605 
   1606 define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind {
   1607 ; CHECK-LABEL: test_mm512_zextpd256_pd512:
   1608 ; CHECK:       # %bb.0:
   1609 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0
   1610 ; CHECK-NEXT:    ret{{[l|q]}}
   1611   %res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1612   ret <8 x double> %res
   1613 }
   1614 
   1615 define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
   1616 ; CHECK-LABEL: test_mm512_zextps128_ps512:
   1617 ; CHECK:       # %bb.0:
   1618 ; CHECK-NEXT:    vmovaps %xmm0, %xmm0
   1619 ; CHECK-NEXT:    ret{{[l|q]}}
   1620   %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
   1621   ret <16 x float> %res
   1622 }
   1623 
   1624 define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind {
   1625 ; CHECK-LABEL: test_mm512_zextps256_ps512:
   1626 ; CHECK:       # %bb.0:
   1627 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0
   1628 ; CHECK-NEXT:    ret{{[l|q]}}
   1629   %res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1630   ret <16 x float> %res
   1631 }
   1632 
   1633 define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind {
   1634 ; CHECK-LABEL: test_mm512_zextsi128_si512:
   1635 ; CHECK:       # %bb.0:
   1636 ; CHECK-NEXT:    vmovaps %xmm0, %xmm0
   1637 ; CHECK-NEXT:    ret{{[l|q]}}
   1638   %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
   1639   ret <8 x i64> %res
   1640 }
   1641 
   1642 define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
   1643 ; CHECK-LABEL: test_mm512_zextsi256_si512:
   1644 ; CHECK:       # %bb.0:
   1645 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0
   1646 ; CHECK-NEXT:    ret{{[l|q]}}
   1647   %res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1648   ret <8 x i64> %res
   1649 }
   1650 
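; mul_epi32 / mul_epu32: the signed form sign-extends the low 32 bits of each i64 via shl/ashr by 32 before a mul nsw, the unsigned form masks with 0xffffffff before a mul nuw; the masked and zero-masked variants are expected to fold to a single vpmuldq/vpmuludq under {%k1}.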
   1651 define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
   1652 ; CHECK-LABEL: test_mm512_mul_epi32:
   1653 ; CHECK:       # %bb.0:
   1654 ; CHECK-NEXT:    vpsllq $32, %zmm0, %zmm0
   1655 ; CHECK-NEXT:    vpsraq $32, %zmm0, %zmm0
   1656 ; CHECK-NEXT:    vpsllq $32, %zmm1, %zmm1
   1657 ; CHECK-NEXT:    vpsraq $32, %zmm1, %zmm1
   1658 ; CHECK-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0
   1659 ; CHECK-NEXT:    ret{{[l|q]}}
   1660   %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   1661   %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   1662   %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   1663   %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   1664   %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
   1665   ret <8 x i64> %tmp4
   1666 }
   1667 
   1668 define <8 x i64> @test_mm512_maskz_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
   1669 ; X86-LABEL: test_mm512_maskz_mul_epi32:
   1670 ; X86:       # %bb.0: # %entry
   1671 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1672 ; X86-NEXT:    kmovw %eax, %k1
   1673 ; X86-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
   1674 ; X86-NEXT:    retl
   1675 ;
   1676 ; X64-LABEL: test_mm512_maskz_mul_epi32:
   1677 ; X64:       # %bb.0: # %entry
   1678 ; X64-NEXT:    kmovw %edi, %k1
   1679 ; X64-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
   1680 ; X64-NEXT:    retq
   1681 entry:
   1682   %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   1683   %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   1684   %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   1685   %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   1686   %4 = mul nsw <8 x i64> %3, %1
   1687   %5 = bitcast i8 %__k to <8 x i1>
   1688   %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
   1689   ret <8 x i64> %6
   1690 }
   1691 
   1692 define <8 x i64> @test_mm512_mask_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
   1693 ; X86-LABEL: test_mm512_mask_mul_epi32:
   1694 ; X86:       # %bb.0: # %entry
   1695 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1696 ; X86-NEXT:    kmovw %eax, %k1
   1697 ; X86-NEXT:    vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
   1698 ; X86-NEXT:    vmovdqa64 %zmm2, %zmm0
   1699 ; X86-NEXT:    retl
   1700 ;
   1701 ; X64-LABEL: test_mm512_mask_mul_epi32:
   1702 ; X64:       # %bb.0: # %entry
   1703 ; X64-NEXT:    kmovw %edi, %k1
   1704 ; X64-NEXT:    vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
   1705 ; X64-NEXT:    vmovdqa64 %zmm2, %zmm0
   1706 ; X64-NEXT:    retq
   1707 entry:
   1708   %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   1709   %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   1710   %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   1711   %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   1712   %4 = mul nsw <8 x i64> %3, %1
   1713   %5 = bitcast i8 %__k to <8 x i1>
   1714   %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %__src
   1715   ret <8 x i64> %6
   1716 }
   1717 
   1718 define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
   1719 ; CHECK-LABEL: test_mm512_mul_epu32:
   1720 ; CHECK:       # %bb.0:
   1721 ; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
   1722 ; CHECK-NEXT:    kmovw %eax, %k0
   1723 ; CHECK-NEXT:    knotw %k0, %k1
   1724 ; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
   1725 ; CHECK-NEXT:    vmovdqa32 %zmm1, %zmm1 {%k1} {z}
   1726 ; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
   1727 ; CHECK-NEXT:    ret{{[l|q]}}
   1728   %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
   1729   %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
   1730   %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
   1731   ret <8 x i64> %tmp2
   1732 }
   1733 
   1734 define <8 x i64> @test_mm512_maskz_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
   1735 ; X86-LABEL: test_mm512_maskz_mul_epu32:
   1736 ; X86:       # %bb.0: # %entry
   1737 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1738 ; X86-NEXT:    kmovw %eax, %k1
   1739 ; X86-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
   1740 ; X86-NEXT:    retl
   1741 ;
   1742 ; X64-LABEL: test_mm512_maskz_mul_epu32:
   1743 ; X64:       # %bb.0: # %entry
   1744 ; X64-NEXT:    kmovw %edi, %k1
   1745 ; X64-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
   1746 ; X64-NEXT:    retq
   1747 entry:
   1748   %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
   1749   %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
   1750   %2 = mul nuw <8 x i64> %1, %0
   1751   %3 = bitcast i8 %__k to <8 x i1>
   1752   %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
   1753   ret <8 x i64> %4
   1754 }
   1755 
   1756 define <8 x i64> @test_mm512_mask_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
   1757 ; X86-LABEL: test_mm512_mask_mul_epu32:
   1758 ; X86:       # %bb.0: # %entry
   1759 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1760 ; X86-NEXT:    kmovw %eax, %k1
   1761 ; X86-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
   1762 ; X86-NEXT:    vmovdqa64 %zmm2, %zmm0
   1763 ; X86-NEXT:    retl
   1764 ;
   1765 ; X64-LABEL: test_mm512_mask_mul_epu32:
   1766 ; X64:       # %bb.0: # %entry
   1767 ; X64-NEXT:    kmovw %edi, %k1
   1768 ; X64-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
   1769 ; X64-NEXT:    vmovdqa64 %zmm2, %zmm0
   1770 ; X64-NEXT:    retq
   1771 entry:
   1772   %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
   1773   %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
   1774   %2 = mul nuw <8 x i64> %1, %0
   1775   %3 = bitcast i8 %__k to <8 x i1>
   1776   %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %__src
   1777   ret <8 x i64> %4
   1778 }
   1779 
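; set1_epi8 broadcasts one byte into all 64 lanes via insertelement + shufflevector; with only +avx512f this legalizes to vpbroadcastb on a ymm followed by vinserti64x4.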
   1780 define <8 x double> @test_mm512_set1_epi8(i8 signext %d) nounwind {
   1781 ; X86-LABEL: test_mm512_set1_epi8:
   1782 ; X86:       # %bb.0: # %entry
   1783 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1784 ; X86-NEXT:    vmovd %eax, %xmm0
   1785 ; X86-NEXT:    vpbroadcastb %xmm0, %ymm0
   1786 ; X86-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
   1787 ; X86-NEXT:    retl
   1788 ;
   1789 ; X64-LABEL: test_mm512_set1_epi8:
   1790 ; X64:       # %bb.0: # %entry
   1791 ; X64-NEXT:    vmovd %edi, %xmm0
   1792 ; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
   1793 ; X64-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
   1794 ; X64-NEXT:    retq
   1795 entry:
   1796   %vecinit.i = insertelement <64 x i8> undef, i8 %d, i32 0
   1797   %vecinit63.i = shufflevector <64 x i8> %vecinit.i, <64 x i8> undef, <64 x i32> zeroinitializer
   1798   %0 = bitcast <64 x i8> %vecinit63.i to <8 x double>
   1799   ret <8 x double> %0
   1800 }
   1801 
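; Scalar unsigned conversions (cvtu32/cvtu64 to sd/ss): uitofp of the scalar inserted into element 0; on the 32-bit target the i64 cases cannot use vcvtusi2*q, so they expand into FP sequences.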
   1802 define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) {
   1803 ; X86-LABEL: test_mm_cvtu32_sd:
   1804 ; X86:       # %bb.0: # %entry
   1805 ; X86-NEXT:    vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
   1806 ; X86-NEXT:    retl
   1807 ;
   1808 ; X64-LABEL: test_mm_cvtu32_sd:
   1809 ; X64:       # %bb.0: # %entry
   1810 ; X64-NEXT:    vcvtusi2sdl %edi, %xmm0, %xmm0
   1811 ; X64-NEXT:    retq
   1812 entry:
   1813   %conv.i = uitofp i32 %__B to double
   1814   %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
   1815   ret <2 x double> %vecins.i
   1816 }
   1817 
   1818 define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
   1819 ; X86-LABEL: test_mm_cvtu64_sd:
   1820 ; X86:       # %bb.0: # %entry
   1821 ; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1822 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
   1823 ; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
   1824 ; X86-NEXT:    vsubpd {{\.LCPI.*}}, %xmm1, %xmm1
   1825 ; X86-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
   1826 ; X86-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
   1827 ; X86-NEXT:    retl
   1828 ;
   1829 ; X64-LABEL: test_mm_cvtu64_sd:
   1830 ; X64:       # %bb.0: # %entry
   1831 ; X64-NEXT:    vcvtusi2sdq %rdi, %xmm0, %xmm0
   1832 ; X64-NEXT:    retq
   1833 entry:
   1834   %conv.i = uitofp i64 %__B to double
   1835   %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
   1836   ret <2 x double> %vecins.i
   1837 }
   1838 
   1839 define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) {
   1840 ; X86-LABEL: test_mm_cvtu32_ss:
   1841 ; X86:       # %bb.0: # %entry
   1842 ; X86-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
   1843 ; X86-NEXT:    retl
   1844 ;
   1845 ; X64-LABEL: test_mm_cvtu32_ss:
   1846 ; X64:       # %bb.0: # %entry
   1847 ; X64-NEXT:    vcvtusi2ssl %edi, %xmm0, %xmm0
   1848 ; X64-NEXT:    retq
   1849 entry:
   1850   %conv.i = uitofp i32 %__B to float
   1851   %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
   1852   ret <4 x float> %vecins.i
   1853 }
   1854 
   1855 define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
   1856 ; X86-LABEL: test_mm_cvtu64_ss:
   1857 ; X86:       # %bb.0: # %entry
   1858 ; X86-NEXT:    pushl %ebp
   1859 ; X86-NEXT:    .cfi_def_cfa_offset 8
   1860 ; X86-NEXT:    .cfi_offset %ebp, -8
   1861 ; X86-NEXT:    movl %esp, %ebp
   1862 ; X86-NEXT:    .cfi_def_cfa_register %ebp
   1863 ; X86-NEXT:    andl $-8, %esp
   1864 ; X86-NEXT:    subl $16, %esp
   1865 ; X86-NEXT:    movl 12(%ebp), %eax
   1866 ; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1867 ; X86-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
   1868 ; X86-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
   1869 ; X86-NEXT:    xorl %ecx, %ecx
   1870 ; X86-NEXT:    testl %eax, %eax
   1871 ; X86-NEXT:    setns %cl
   1872 ; X86-NEXT:    fildll {{[0-9]+}}(%esp)
   1873 ; X86-NEXT:    fadds {{\.LCPI.*}}(,%ecx,4)
   1874 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
   1875 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1876 ; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
   1877 ; X86-NEXT:    movl %ebp, %esp
   1878 ; X86-NEXT:    popl %ebp
   1879 ; X86-NEXT:    .cfi_def_cfa %esp, 4
   1880 ; X86-NEXT:    retl
   1881 ;
   1882 ; X64-LABEL: test_mm_cvtu64_ss:
   1883 ; X64:       # %bb.0: # %entry
   1884 ; X64-NEXT:    vcvtusi2ssq %rdi, %xmm0, %xmm0
   1885 ; X64-NEXT:    retq
   1886 entry:
   1887   %conv.i = uitofp i64 %__B to float
   1888   %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
   1889   ret <4 x float> %vecins.i
   1890 }
   1891 
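; cvtps_pd / cvtpslo_pd: fpext of eight floats to eight doubles (cvtpslo first extracts the low half of the <16 x float>); masked variants select on an i8 mask and expect a masked vcvtps2pd.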
   1892 define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) {
   1893 ; CHECK-LABEL: test_mm512_cvtps_pd:
   1894 ; CHECK:       # %bb.0: # %entry
   1895 ; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm0
   1896 ; CHECK-NEXT:    ret{{[l|q]}}
   1897 entry:
   1898   %conv.i = fpext <8 x float> %__A to <8 x double>
   1899   ret <8 x double> %conv.i
   1900 }
   1901 
   1902 define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) {
   1903 ; CHECK-LABEL: test_mm512_cvtpslo_pd:
   1904 ; CHECK:       # %bb.0: # %entry
   1905 ; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm0
   1906 ; CHECK-NEXT:    ret{{[l|q]}}
   1907 entry:
   1908   %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1909   %conv.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
   1910   ret <8 x double> %conv.i.i
   1911 }
   1912 
   1913 define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) {
   1914 ; X86-LABEL: test_mm512_mask_cvtps_pd:
   1915 ; X86:       # %bb.0: # %entry
   1916 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1917 ; X86-NEXT:    kmovw %eax, %k1
   1918 ; X86-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
   1919 ; X86-NEXT:    retl
   1920 ;
   1921 ; X64-LABEL: test_mm512_mask_cvtps_pd:
   1922 ; X64:       # %bb.0: # %entry
   1923 ; X64-NEXT:    kmovw %edi, %k1
   1924 ; X64-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
   1925 ; X64-NEXT:    retq
   1926 entry:
   1927   %conv.i.i = fpext <8 x float> %__A to <8 x double>
   1928   %0 = bitcast i8 %__U to <8 x i1>
   1929   %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> %__W
   1930   ret <8 x double> %1
   1931 }
   1932 
   1933 define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) {
   1934 ; X86-LABEL: test_mm512_mask_cvtpslo_pd:
   1935 ; X86:       # %bb.0: # %entry
   1936 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1937 ; X86-NEXT:    kmovw %eax, %k1
   1938 ; X86-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
   1939 ; X86-NEXT:    retl
   1940 ;
   1941 ; X64-LABEL: test_mm512_mask_cvtpslo_pd:
   1942 ; X64:       # %bb.0: # %entry
   1943 ; X64-NEXT:    kmovw %edi, %k1
   1944 ; X64-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
   1945 ; X64-NEXT:    retq
   1946 entry:
   1947   %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1948   %conv.i.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
   1949   %0 = bitcast i8 %__U to <8 x i1>
   1950   %1 = select <8 x i1> %0, <8 x double> %conv.i.i.i, <8 x double> %__W
   1951   ret <8 x double> %1
   1952 }
   1953 
   1954 define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) {
   1955 ; X86-LABEL: test_mm512_maskz_cvtps_pd:
   1956 ; X86:       # %bb.0: # %entry
   1957 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   1958 ; X86-NEXT:    kmovw %eax, %k1
   1959 ; X86-NEXT:    vcvtps2pd %ymm0, %zmm0 {%k1} {z}
   1960 ; X86-NEXT:    retl
   1961 ;
   1962 ; X64-LABEL: test_mm512_maskz_cvtps_pd:
   1963 ; X64:       # %bb.0: # %entry
   1964 ; X64-NEXT:    kmovw %edi, %k1
   1965 ; X64-NEXT:    vcvtps2pd %ymm0, %zmm0 {%k1} {z}
   1966 ; X64-NEXT:    retq
   1967 entry:
   1968   %conv.i.i = fpext <8 x float> %__A to <8 x double>
   1969   %0 = bitcast i8 %__U to <8 x i1>
   1970   %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> zeroinitializer
   1971   ret <8 x double> %1
   1972 }
   1973 
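; Truncating converts: the unmasked forms use plain trunc; the masked epi32->epi8 and epi64->epi16 forms go through the llvm.x86.avx512.mask.pmov.* intrinsics declared below, while the epi64->epi32 masked forms use trunc plus select.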
   1974 define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) {
   1975 ; CHECK-LABEL: test_mm512_cvtepi32_epi8:
   1976 ; CHECK:       # %bb.0: # %entry
   1977 ; CHECK-NEXT:    vpmovdb %zmm0, %xmm0
   1978 ; CHECK-NEXT:    vzeroupper
   1979 ; CHECK-NEXT:    ret{{[l|q]}}
   1980 entry:
   1981   %0 = bitcast <8 x i64> %__A to <16 x i32>
   1982   %conv.i = trunc <16 x i32> %0 to <16 x i8>
   1983   %1 = bitcast <16 x i8> %conv.i to <2 x i64>
   1984   ret <2 x i64> %1
   1985 }
   1986 
   1987 define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) {
   1988 ; X86-LABEL: test_mm512_mask_cvtepi32_epi8:
   1989 ; X86:       # %bb.0: # %entry
   1990 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   1991 ; X86-NEXT:    vpmovdb %zmm1, %xmm0 {%k1}
   1992 ; X86-NEXT:    vzeroupper
   1993 ; X86-NEXT:    retl
   1994 ;
   1995 ; X64-LABEL: test_mm512_mask_cvtepi32_epi8:
   1996 ; X64:       # %bb.0: # %entry
   1997 ; X64-NEXT:    kmovw %edi, %k1
   1998 ; X64-NEXT:    vpmovdb %zmm1, %xmm0 {%k1}
   1999 ; X64-NEXT:    vzeroupper
   2000 ; X64-NEXT:    retq
   2001 entry:
   2002   %0 = bitcast <8 x i64> %__A to <16 x i32>
   2003   %1 = bitcast <2 x i64> %__O to <16 x i8>
   2004   %2 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> %1, i16 %__M)
   2005   %3 = bitcast <16 x i8> %2 to <2 x i64>
   2006   ret <2 x i64> %3
   2007 }
   2008 
   2009 define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) {
   2010 ; X86-LABEL: test_mm512_maskz_cvtepi32_epi8:
   2011 ; X86:       # %bb.0: # %entry
   2012 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   2013 ; X86-NEXT:    vpmovdb %zmm0, %xmm0 {%k1} {z}
   2014 ; X86-NEXT:    vzeroupper
   2015 ; X86-NEXT:    retl
   2016 ;
   2017 ; X64-LABEL: test_mm512_maskz_cvtepi32_epi8:
   2018 ; X64:       # %bb.0: # %entry
   2019 ; X64-NEXT:    kmovw %edi, %k1
   2020 ; X64-NEXT:    vpmovdb %zmm0, %xmm0 {%k1} {z}
   2021 ; X64-NEXT:    vzeroupper
   2022 ; X64-NEXT:    retq
   2023 entry:
   2024   %0 = bitcast <8 x i64> %__A to <16 x i32>
   2025   %1 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> zeroinitializer, i16 %__M)
   2026   %2 = bitcast <16 x i8> %1 to <2 x i64>
   2027   ret <2 x i64> %2
   2028 }
   2029 
   2030 define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) {
   2031 ; CHECK-LABEL: test_mm512_cvtepi64_epi32:
   2032 ; CHECK:       # %bb.0: # %entry
   2033 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm0
   2034 ; CHECK-NEXT:    ret{{[l|q]}}
   2035 entry:
   2036   %conv.i = trunc <8 x i64> %__A to <8 x i32>
   2037   %0 = bitcast <8 x i32> %conv.i to <4 x i64>
   2038   ret <4 x i64> %0
   2039 }
   2040 
   2041 define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
   2042 ; X86-LABEL: test_mm512_mask_cvtepi64_epi32:
   2043 ; X86:       # %bb.0: # %entry
   2044 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2045 ; X86-NEXT:    kmovw %eax, %k1
   2046 ; X86-NEXT:    vpmovqd %zmm1, %ymm0 {%k1}
   2047 ; X86-NEXT:    retl
   2048 ;
   2049 ; X64-LABEL: test_mm512_mask_cvtepi64_epi32:
   2050 ; X64:       # %bb.0: # %entry
   2051 ; X64-NEXT:    kmovw %edi, %k1
   2052 ; X64-NEXT:    vpmovqd %zmm1, %ymm0 {%k1}
   2053 ; X64-NEXT:    retq
   2054 entry:
   2055   %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
   2056   %0 = bitcast <4 x i64> %__O to <8 x i32>
   2057   %1 = bitcast i8 %__M to <8 x i1>
   2058   %2 = select <8 x i1> %1, <8 x i32> %conv.i.i, <8 x i32> %0
   2059   %3 = bitcast <8 x i32> %2 to <4 x i64>
   2060   ret <4 x i64> %3
   2061 }
   2062 
   2063 define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) {
   2064 ; X86-LABEL: test_mm512_maskz_cvtepi64_epi32:
   2065 ; X86:       # %bb.0: # %entry
   2066 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2067 ; X86-NEXT:    kmovw %eax, %k1
   2068 ; X86-NEXT:    vpmovqd %zmm0, %ymm0 {%k1} {z}
   2069 ; X86-NEXT:    retl
   2070 ;
   2071 ; X64-LABEL: test_mm512_maskz_cvtepi64_epi32:
   2072 ; X64:       # %bb.0: # %entry
   2073 ; X64-NEXT:    kmovw %edi, %k1
   2074 ; X64-NEXT:    vpmovqd %zmm0, %ymm0 {%k1} {z}
   2075 ; X64-NEXT:    retq
   2076 entry:
   2077   %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
   2078   %0 = bitcast i8 %__M to <8 x i1>
   2079   %1 = select <8 x i1> %0, <8 x i32> %conv.i.i, <8 x i32> zeroinitializer
   2080   %2 = bitcast <8 x i32> %1 to <4 x i64>
   2081   ret <4 x i64> %2
   2082 }
   2083 
   2084 define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) {
   2085 ; CHECK-LABEL: test_mm512_cvtepi64_epi16:
   2086 ; CHECK:       # %bb.0: # %entry
   2087 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm0
   2088 ; CHECK-NEXT:    vzeroupper
   2089 ; CHECK-NEXT:    ret{{[l|q]}}
   2090 entry:
   2091   %conv.i = trunc <8 x i64> %__A to <8 x i16>
   2092   %0 = bitcast <8 x i16> %conv.i to <2 x i64>
   2093   ret <2 x i64> %0
   2094 }
   2095 
   2096 define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
   2097 ; X86-LABEL: test_mm512_mask_cvtepi64_epi16:
   2098 ; X86:       # %bb.0: # %entry
   2099 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2100 ; X86-NEXT:    kmovw %eax, %k1
   2101 ; X86-NEXT:    vpmovqw %zmm1, %xmm0 {%k1}
   2102 ; X86-NEXT:    vzeroupper
   2103 ; X86-NEXT:    retl
   2104 ;
   2105 ; X64-LABEL: test_mm512_mask_cvtepi64_epi16:
   2106 ; X64:       # %bb.0: # %entry
   2107 ; X64-NEXT:    kmovw %edi, %k1
   2108 ; X64-NEXT:    vpmovqw %zmm1, %xmm0 {%k1}
   2109 ; X64-NEXT:    vzeroupper
   2110 ; X64-NEXT:    retq
   2111 entry:
   2112   %0 = bitcast <2 x i64> %__O to <8 x i16>
   2113   %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> %0, i8 %__M)
   2114   %2 = bitcast <8 x i16> %1 to <2 x i64>
   2115   ret <2 x i64> %2
   2116 }
   2117 
   2118 define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) {
   2119 ; X86-LABEL: test_mm512_maskz_cvtepi64_epi16:
   2120 ; X86:       # %bb.0: # %entry
   2121 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2122 ; X86-NEXT:    kmovw %eax, %k1
   2123 ; X86-NEXT:    vpmovqw %zmm0, %xmm0 {%k1} {z}
   2124 ; X86-NEXT:    vzeroupper
   2125 ; X86-NEXT:    retl
   2126 ;
   2127 ; X64-LABEL: test_mm512_maskz_cvtepi64_epi16:
   2128 ; X64:       # %bb.0: # %entry
   2129 ; X64-NEXT:    kmovw %edi, %k1
   2130 ; X64-NEXT:    vpmovqw %zmm0, %xmm0 {%k1} {z}
   2131 ; X64-NEXT:    vzeroupper
   2132 ; X64-NEXT:    retq
   2133 entry:
   2134   %0 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> zeroinitializer, i8 %__M)
   2135   %1 = bitcast <8 x i16> %0 to <2 x i64>
   2136   ret <2 x i64> %1
   2137 }
   2138 
   2139 declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
   2140 declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
   2141 
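; ternarylogic tests call llvm.x86.avx512.pternlog.{d,q}.512 with immediate 4; the masked forms blend with the first source operand and the zero-masked forms with zeroinitializer.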
   2142 define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
   2143 ; CHECK-LABEL: test_mm512_ternarylogic_epi32:
   2144 ; CHECK:       # %bb.0: # %entry
   2145 ; CHECK-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0
   2146 ; CHECK-NEXT:    ret{{[l|q]}}
   2147 entry:
   2148   %0 = bitcast <8 x i64> %__A to <16 x i32>
   2149   %1 = bitcast <8 x i64> %__B to <16 x i32>
   2150   %2 = bitcast <8 x i64> %__C to <16 x i32>
   2151   %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
   2152   %4 = bitcast <16 x i32> %3 to <8 x i64>
   2153   ret <8 x i64> %4
   2154 }
   2155 
   2156 declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1
   2157 
   2158 define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
   2159 ; X86-LABEL: test_mm512_mask_ternarylogic_epi32:
   2160 ; X86:       # %bb.0: # %entry
   2161 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   2162 ; X86-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
   2163 ; X86-NEXT:    retl
   2164 ;
   2165 ; X64-LABEL: test_mm512_mask_ternarylogic_epi32:
   2166 ; X64:       # %bb.0: # %entry
   2167 ; X64-NEXT:    kmovw %edi, %k1
   2168 ; X64-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
   2169 ; X64-NEXT:    retq
   2170 entry:
   2171   %0 = bitcast <8 x i64> %__A to <16 x i32>
   2172   %1 = bitcast <8 x i64> %__B to <16 x i32>
   2173   %2 = bitcast <8 x i64> %__C to <16 x i32>
   2174   %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
   2175   %4 = bitcast i16 %__U to <16 x i1>
   2176   %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
   2177   %6 = bitcast <16 x i32> %5 to <8 x i64>
   2178   ret <8 x i64> %6
   2179 }
   2180 
   2181 define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
   2182 ; X86-LABEL: test_mm512_maskz_ternarylogic_epi32:
   2183 ; X86:       # %bb.0: # %entry
   2184 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   2185 ; X86-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
   2186 ; X86-NEXT:    retl
   2187 ;
   2188 ; X64-LABEL: test_mm512_maskz_ternarylogic_epi32:
   2189 ; X64:       # %bb.0: # %entry
   2190 ; X64-NEXT:    kmovw %edi, %k1
   2191 ; X64-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
   2192 ; X64-NEXT:    retq
   2193 entry:
   2194   %0 = bitcast <8 x i64> %__A to <16 x i32>
   2195   %1 = bitcast <8 x i64> %__B to <16 x i32>
   2196   %2 = bitcast <8 x i64> %__C to <16 x i32>
   2197   %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
   2198   %4 = bitcast i16 %__U to <16 x i1>
   2199   %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
   2200   %6 = bitcast <16 x i32> %5 to <8 x i64>
   2201   ret <8 x i64> %6
   2202 }
   2203 
   2204 define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
   2205 ; CHECK-LABEL: test_mm512_ternarylogic_epi64:
   2206 ; CHECK:       # %bb.0: # %entry
   2207 ; CHECK-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0
   2208 ; CHECK-NEXT:    ret{{[l|q]}}
   2209 entry:
   2210   %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
   2211   ret <8 x i64> %0
   2212 }
   2213 
   2214 declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) #1
   2215 
   2216 define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
   2217 ; X86-LABEL: test_mm512_mask_ternarylogic_epi64:
   2218 ; X86:       # %bb.0: # %entry
   2219 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2220 ; X86-NEXT:    kmovw %eax, %k1
   2221 ; X86-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
   2222 ; X86-NEXT:    retl
   2223 ;
   2224 ; X64-LABEL: test_mm512_mask_ternarylogic_epi64:
   2225 ; X64:       # %bb.0: # %entry
   2226 ; X64-NEXT:    kmovw %edi, %k1
   2227 ; X64-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
   2228 ; X64-NEXT:    retq
   2229 entry:
   2230   %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
   2231   %1 = bitcast i8 %__U to <8 x i1>
   2232   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
   2233   ret <8 x i64> %2
   2234 }
   2235 
   2236 define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
   2237 ; X86-LABEL: test_mm512_maskz_ternarylogic_epi64:
   2238 ; X86:       # %bb.0: # %entry
   2239 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2240 ; X86-NEXT:    kmovw %eax, %k1
   2241 ; X86-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
   2242 ; X86-NEXT:    retl
   2243 ;
   2244 ; X64-LABEL: test_mm512_maskz_ternarylogic_epi64:
   2245 ; X64:       # %bb.0: # %entry
   2246 ; X64-NEXT:    kmovw %edi, %k1
   2247 ; X64-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
   2248 ; X64-NEXT:    retq
   2249 entry:
   2250   %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
   2251   %1 = bitcast i8 %__U to <8 x i1>
   2252   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
   2253   ret <8 x i64> %2
   2254 }
   2255 
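; mask2_permutex2var tests: vpermi2* with the index operand also acting as the passthru, so the select falls back to __I (bitcast to the element type for the pd/ps variants).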
   2256 declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
   2257 
   2258 define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) {
   2259 ; X86-LABEL: test_mm512_mask2_permutex2var_epi32:
   2260 ; X86:       # %bb.0: # %entry
   2261 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   2262 ; X86-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
   2263 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
   2264 ; X86-NEXT:    retl
   2265 ;
   2266 ; X64-LABEL: test_mm512_mask2_permutex2var_epi32:
   2267 ; X64:       # %bb.0: # %entry
   2268 ; X64-NEXT:    kmovw %edi, %k1
   2269 ; X64-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
   2270 ; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
   2271 ; X64-NEXT:    retq
   2272 entry:
   2273   %0 = bitcast <8 x i64> %__A to <16 x i32>
   2274   %1 = bitcast <8 x i64> %__I to <16 x i32>
   2275   %2 = bitcast <8 x i64> %__B to <16 x i32>
   2276   %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
   2277   %4 = bitcast i16 %__U to <16 x i1>
   2278   %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %1
   2279   %6 = bitcast <16 x i32> %5 to <8 x i64>
   2280   ret <8 x i64> %6
   2281 }
   2282 
   2283 declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)
   2284 
   2285 define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) {
   2286 ; X86-LABEL: test_mm512_mask2_permutex2var_pd:
   2287 ; X86:       # %bb.0: # %entry
   2288 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2289 ; X86-NEXT:    kmovw %eax, %k1
   2290 ; X86-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
   2291 ; X86-NEXT:    vmovapd %zmm1, %zmm0
   2292 ; X86-NEXT:    retl
   2293 ;
   2294 ; X64-LABEL: test_mm512_mask2_permutex2var_pd:
   2295 ; X64:       # %bb.0: # %entry
   2296 ; X64-NEXT:    kmovw %edi, %k1
   2297 ; X64-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
   2298 ; X64-NEXT:    vmovapd %zmm1, %zmm0
   2299 ; X64-NEXT:    retq
   2300 entry:
   2301   %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
   2302   %1 = bitcast <8 x i64> %__I to <8 x double>
   2303   %2 = bitcast i8 %__U to <8 x i1>
   2304   %3 = select <8 x i1> %2, <8 x double> %0, <8 x double> %1
   2305   ret <8 x double> %3
   2306 }
   2307 
   2308 declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)
   2309 
   2310 define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) {
   2311 ; X86-LABEL: test_mm512_mask2_permutex2var_ps:
   2312 ; X86:       # %bb.0: # %entry
   2313 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   2314 ; X86-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
   2315 ; X86-NEXT:    vmovaps %zmm1, %zmm0
   2316 ; X86-NEXT:    retl
   2317 ;
   2318 ; X64-LABEL: test_mm512_mask2_permutex2var_ps:
   2319 ; X64:       # %bb.0: # %entry
   2320 ; X64-NEXT:    kmovw %edi, %k1
   2321 ; X64-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
   2322 ; X64-NEXT:    vmovaps %zmm1, %zmm0
   2323 ; X64-NEXT:    retq
   2324 entry:
   2325   %0 = bitcast <8 x i64> %__I to <16 x i32>
   2326   %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
   2327   %2 = bitcast <8 x i64> %__I to <16 x float>
   2328   %3 = bitcast i16 %__U to <16 x i1>
   2329   %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
   2330   ret <16 x float> %4
   2331 }
   2332 
   2333 declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
   2334 
   2335 define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) {
   2336 ; X86-LABEL: test_mm512_mask2_permutex2var_epi64:
   2337 ; X86:       # %bb.0: # %entry
   2338 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2339 ; X86-NEXT:    kmovw %eax, %k1
   2340 ; X86-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
   2341 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
   2342 ; X86-NEXT:    retl
   2343 ;
   2344 ; X64-LABEL: test_mm512_mask2_permutex2var_epi64:
   2345 ; X64:       # %bb.0: # %entry
   2346 ; X64-NEXT:    kmovw %edi, %k1
   2347 ; X64-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
   2348 ; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
   2349 ; X64-NEXT:    retq
   2350 entry:
   2351   %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
   2352   %1 = bitcast i8 %__U to <8 x i1>
   2353   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__I
   2354   ret <8 x i64> %2
   2355 }
   2356 
   2357 define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
   2358 ; CHECK-LABEL: test_mm512_permutex2var_epi32:
   2359 ; CHECK:       # %bb.0: # %entry
   2360 ; CHECK-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0
   2361 ; CHECK-NEXT:    ret{{[l|q]}}
   2362 entry:
   2363   %0 = bitcast <8 x i64> %__A to <16 x i32>
   2364   %1 = bitcast <8 x i64> %__I to <16 x i32>
   2365   %2 = bitcast <8 x i64> %__B to <16 x i32>
   2366   %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
   2367   %4 = bitcast <16 x i32> %3 to <8 x i64>
   2368   ret <8 x i64> %4
   2369 }
   2370 
   2371 define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
   2372 ; X86-LABEL: test_mm512_maskz_permutex2var_epi32:
   2373 ; X86:       # %bb.0: # %entry
   2374 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   2375 ; X86-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
   2376 ; X86-NEXT:    retl
   2377 ;
   2378 ; X64-LABEL: test_mm512_maskz_permutex2var_epi32:
   2379 ; X64:       # %bb.0: # %entry
   2380 ; X64-NEXT:    kmovw %edi, %k1
   2381 ; X64-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
   2382 ; X64-NEXT:    retq
   2383 entry:
   2384   %0 = bitcast <8 x i64> %__A to <16 x i32>
   2385   %1 = bitcast <8 x i64> %__I to <16 x i32>
   2386   %2 = bitcast <8 x i64> %__B to <16 x i32>
   2387   %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
   2388   %4 = bitcast i16 %__U to <16 x i1>
   2389   %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
   2390   %6 = bitcast <16 x i32> %5 to <8 x i64>
   2391   ret <8 x i64> %6
   2392 }
   2393 
   2394 define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
   2395 ; X86-LABEL: test_mm512_mask_permutex2var_epi32:
   2396 ; X86:       # %bb.0: # %entry
   2397 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   2398 ; X86-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
   2399 ; X86-NEXT:    retl
   2400 ;
   2401 ; X64-LABEL: test_mm512_mask_permutex2var_epi32:
   2402 ; X64:       # %bb.0: # %entry
   2403 ; X64-NEXT:    kmovw %edi, %k1
   2404 ; X64-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
   2405 ; X64-NEXT:    retq
   2406 entry:
   2407   %0 = bitcast <8 x i64> %__A to <16 x i32>
   2408   %1 = bitcast <8 x i64> %__I to <16 x i32>
   2409   %2 = bitcast <8 x i64> %__B to <16 x i32>
   2410   %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
   2411   %4 = bitcast i16 %__U to <16 x i1>
   2412   %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
   2413   %6 = bitcast <16 x i32> %5 to <8 x i64>
   2414   ret <8 x i64> %6
   2415 }
   2416 
   2417 define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
   2418 ; CHECK-LABEL: test_mm512_permutex2var_pd:
   2419 ; CHECK:       # %bb.0: # %entry
   2420 ; CHECK-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0
   2421 ; CHECK-NEXT:    ret{{[l|q]}}
   2422 entry:
   2423   %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
   2424   ret <8 x double> %0
   2425 }
   2426 
   2427 define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) {
   2428 ; X86-LABEL: test_mm512_mask_permutex2var_pd:
   2429 ; X86:       # %bb.0: # %entry
   2430 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2431 ; X86-NEXT:    kmovw %eax, %k1
   2432 ; X86-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
   2433 ; X86-NEXT:    retl
   2434 ;
   2435 ; X64-LABEL: test_mm512_mask_permutex2var_pd:
   2436 ; X64:       # %bb.0: # %entry
   2437 ; X64-NEXT:    kmovw %edi, %k1
   2438 ; X64-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
   2439 ; X64-NEXT:    retq
   2440 entry:
   2441   %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
   2442   %1 = bitcast i8 %__U to <8 x i1>
   2443   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
   2444   ret <8 x double> %2
   2445 }
   2446 
   2447 define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
   2448 ; X86-LABEL: test_mm512_maskz_permutex2var_pd:
   2449 ; X86:       # %bb.0: # %entry
   2450 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2451 ; X86-NEXT:    kmovw %eax, %k1
   2452 ; X86-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
   2453 ; X86-NEXT:    retl
   2454 ;
   2455 ; X64-LABEL: test_mm512_maskz_permutex2var_pd:
   2456 ; X64:       # %bb.0: # %entry
   2457 ; X64-NEXT:    kmovw %edi, %k1
   2458 ; X64-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
   2459 ; X64-NEXT:    retq
   2460 entry:
   2461   %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
   2462   %1 = bitcast i8 %__U to <8 x i1>
   2463   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   2464   ret <8 x double> %2
   2465 }
   2466 
   2467 define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
   2468 ; CHECK-LABEL: test_mm512_permutex2var_ps:
   2469 ; CHECK:       # %bb.0: # %entry
   2470 ; CHECK-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
   2471 ; CHECK-NEXT:    ret{{[l|q]}}
   2472 entry:
   2473   %0 = bitcast <8 x i64> %__I to <16 x i32>
   2474   %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
   2475   ret <16 x float> %1
   2476 }
   2477 
   2478 define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) {
   2479 ; X86-LABEL: test_mm512_mask_permutex2var_ps:
   2480 ; X86:       # %bb.0: # %entry
   2481 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   2482 ; X86-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
   2483 ; X86-NEXT:    retl
   2484 ;
   2485 ; X64-LABEL: test_mm512_mask_permutex2var_ps:
   2486 ; X64:       # %bb.0: # %entry
   2487 ; X64-NEXT:    kmovw %edi, %k1
   2488 ; X64-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
   2489 ; X64-NEXT:    retq
   2490 entry:
   2491   %0 = bitcast <8 x i64> %__I to <16 x i32>
   2492   %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
   2493   %2 = bitcast i16 %__U to <16 x i1>
   2494   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %__A
   2495   ret <16 x float> %3
   2496 }
   2497 
   2498 define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
   2499 ; X86-LABEL: test_mm512_maskz_permutex2var_ps:
   2500 ; X86:       # %bb.0: # %entry
   2501 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   2502 ; X86-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
   2503 ; X86-NEXT:    retl
   2504 ;
   2505 ; X64-LABEL: test_mm512_maskz_permutex2var_ps:
   2506 ; X64:       # %bb.0: # %entry
   2507 ; X64-NEXT:    kmovw %edi, %k1
   2508 ; X64-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
   2509 ; X64-NEXT:    retq
   2510 entry:
   2511   %0 = bitcast <8 x i64> %__I to <16 x i32>
   2512   %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
   2513   %2 = bitcast i16 %__U to <16 x i1>
   2514   %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
   2515   ret <16 x float> %3
   2516 }
   2517 
   2518 define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
   2519 ; CHECK-LABEL: test_mm512_permutex2var_epi64:
   2520 ; CHECK:       # %bb.0: # %entry
   2521 ; CHECK-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0
   2522 ; CHECK-NEXT:    ret{{[l|q]}}
   2523 entry:
   2524   %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
   2525   ret <8 x i64> %0
   2526 }
   2527 
   2528 define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
   2529 ; X86-LABEL: test_mm512_mask_permutex2var_epi64:
   2530 ; X86:       # %bb.0: # %entry
   2531 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2532 ; X86-NEXT:    kmovw %eax, %k1
   2533 ; X86-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
   2534 ; X86-NEXT:    retl
   2535 ;
   2536 ; X64-LABEL: test_mm512_mask_permutex2var_epi64:
   2537 ; X64:       # %bb.0: # %entry
   2538 ; X64-NEXT:    kmovw %edi, %k1
   2539 ; X64-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
   2540 ; X64-NEXT:    retq
   2541 entry:
   2542   %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
   2543   %1 = bitcast i8 %__U to <8 x i1>
   2544   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
   2545   ret <8 x i64> %2
   2546 }
   2547 
   2548 define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
   2549 ; X86-LABEL: test_mm512_maskz_permutex2var_epi64:
   2550 ; X86:       # %bb.0: # %entry
   2551 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2552 ; X86-NEXT:    kmovw %eax, %k1
   2553 ; X86-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
   2554 ; X86-NEXT:    retl
   2555 ;
   2556 ; X64-LABEL: test_mm512_maskz_permutex2var_epi64:
   2557 ; X64:       # %bb.0: # %entry
   2558 ; X64-NEXT:    kmovw %edi, %k1
   2559 ; X64-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
   2560 ; X64-NEXT:    retq
   2561 entry:
   2562   %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
   2563   %1 = bitcast i8 %__U to <8 x i1>
   2564   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
   2565   ret <8 x i64> %2
   2566 }

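; Masked scalar arithmetic (add/sub/mul/div on ss/sd). The add, sub and mul tests
; model the mask with an 'and i8 %__U, 1' plus a scalar select on element 0, while
; the div tests bitcast the mask to <8 x i1> and extract bit 0; both patterns are
; expected to lower to the masked scalar instruction ({%k1} / {%k1} {z}).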
   2567 define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   2568 ; X86-LABEL: test_mm_mask_add_ss:
   2569 ; X86:       # %bb.0: # %entry
   2570 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2571 ; X86-NEXT:    kmovw %eax, %k1
   2572 ; X86-NEXT:    vaddss %xmm2, %xmm1, %xmm0 {%k1}
   2573 ; X86-NEXT:    retl
   2574 ;
   2575 ; X64-LABEL: test_mm_mask_add_ss:
   2576 ; X64:       # %bb.0: # %entry
   2577 ; X64-NEXT:    kmovw %edi, %k1
   2578 ; X64-NEXT:    vaddss %xmm2, %xmm1, %xmm0 {%k1}
   2579 ; X64-NEXT:    retq
   2580 entry:
   2581   %vecext.i.i = extractelement <4 x float> %__B, i32 0
   2582   %vecext1.i.i = extractelement <4 x float> %__A, i32 0
   2583   %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
   2584   %0 = and i8 %__U, 1
   2585   %tobool.i = icmp eq i8 %0, 0
   2586   %vecext1.i = extractelement <4 x float> %__W, i32 0
   2587   %cond.i = select i1 %tobool.i, float %vecext1.i, float %add.i.i
   2588   %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
   2589   ret <4 x float> %vecins.i
   2590 }
   2591 
   2592 define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   2593 ; X86-LABEL: test_mm_maskz_add_ss:
   2594 ; X86:       # %bb.0: # %entry
   2595 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2596 ; X86-NEXT:    kmovw %eax, %k1
   2597 ; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
   2598 ; X86-NEXT:    retl
   2599 ;
   2600 ; X64-LABEL: test_mm_maskz_add_ss:
   2601 ; X64:       # %bb.0: # %entry
   2602 ; X64-NEXT:    kmovw %edi, %k1
   2603 ; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
   2604 ; X64-NEXT:    retq
   2605 entry:
   2606   %vecext.i.i = extractelement <4 x float> %__B, i32 0
   2607   %vecext1.i.i = extractelement <4 x float> %__A, i32 0
   2608   %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
   2609   %0 = and i8 %__U, 1
   2610   %tobool.i = icmp eq i8 %0, 0
   2611   %cond.i = select i1 %tobool.i, float 0.000000e+00, float %add.i.i
   2612   %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
   2613   ret <4 x float> %vecins.i
   2614 }
   2615 
   2616 define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   2617 ; X86-LABEL: test_mm_mask_add_sd:
   2618 ; X86:       # %bb.0: # %entry
   2619 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2620 ; X86-NEXT:    kmovw %eax, %k1
   2621 ; X86-NEXT:    vaddsd %xmm2, %xmm1, %xmm0 {%k1}
   2622 ; X86-NEXT:    retl
   2623 ;
   2624 ; X64-LABEL: test_mm_mask_add_sd:
   2625 ; X64:       # %bb.0: # %entry
   2626 ; X64-NEXT:    kmovw %edi, %k1
   2627 ; X64-NEXT:    vaddsd %xmm2, %xmm1, %xmm0 {%k1}
   2628 ; X64-NEXT:    retq
   2629 entry:
   2630   %vecext.i.i = extractelement <2 x double> %__B, i32 0
   2631   %vecext1.i.i = extractelement <2 x double> %__A, i32 0
   2632   %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
   2633   %0 = and i8 %__U, 1
   2634   %tobool.i = icmp eq i8 %0, 0
   2635   %vecext1.i = extractelement <2 x double> %__W, i32 0
   2636   %cond.i = select i1 %tobool.i, double %vecext1.i, double %add.i.i
   2637   %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
   2638   ret <2 x double> %vecins.i
   2639 }
   2640 
   2641 define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   2642 ; X86-LABEL: test_mm_maskz_add_sd:
   2643 ; X86:       # %bb.0: # %entry
   2644 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2645 ; X86-NEXT:    kmovw %eax, %k1
   2646 ; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
   2647 ; X86-NEXT:    retl
   2648 ;
   2649 ; X64-LABEL: test_mm_maskz_add_sd:
   2650 ; X64:       # %bb.0: # %entry
   2651 ; X64-NEXT:    kmovw %edi, %k1
   2652 ; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
   2653 ; X64-NEXT:    retq
   2654 entry:
   2655   %vecext.i.i = extractelement <2 x double> %__B, i32 0
   2656   %vecext1.i.i = extractelement <2 x double> %__A, i32 0
   2657   %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
   2658   %0 = and i8 %__U, 1
   2659   %tobool.i = icmp eq i8 %0, 0
   2660   %cond.i = select i1 %tobool.i, double 0.000000e+00, double %add.i.i
   2661   %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
   2662   ret <2 x double> %vecins.i
   2663 }
   2664 
   2665 define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   2666 ; X86-LABEL: test_mm_mask_sub_ss:
   2667 ; X86:       # %bb.0: # %entry
   2668 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2669 ; X86-NEXT:    kmovw %eax, %k1
   2670 ; X86-NEXT:    vsubss %xmm2, %xmm1, %xmm0 {%k1}
   2671 ; X86-NEXT:    retl
   2672 ;
   2673 ; X64-LABEL: test_mm_mask_sub_ss:
   2674 ; X64:       # %bb.0: # %entry
   2675 ; X64-NEXT:    kmovw %edi, %k1
   2676 ; X64-NEXT:    vsubss %xmm2, %xmm1, %xmm0 {%k1}
   2677 ; X64-NEXT:    retq
   2678 entry:
   2679   %vecext.i.i = extractelement <4 x float> %__B, i32 0
   2680   %vecext1.i.i = extractelement <4 x float> %__A, i32 0
   2681   %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
   2682   %0 = and i8 %__U, 1
   2683   %tobool.i = icmp eq i8 %0, 0
   2684   %vecext1.i = extractelement <4 x float> %__W, i32 0
   2685   %cond.i = select i1 %tobool.i, float %vecext1.i, float %sub.i.i
   2686   %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
   2687   ret <4 x float> %vecins.i
   2688 }
   2689 
   2690 define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   2691 ; X86-LABEL: test_mm_maskz_sub_ss:
   2692 ; X86:       # %bb.0: # %entry
   2693 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2694 ; X86-NEXT:    kmovw %eax, %k1
   2695 ; X86-NEXT:    vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
   2696 ; X86-NEXT:    retl
   2697 ;
   2698 ; X64-LABEL: test_mm_maskz_sub_ss:
   2699 ; X64:       # %bb.0: # %entry
   2700 ; X64-NEXT:    kmovw %edi, %k1
   2701 ; X64-NEXT:    vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
   2702 ; X64-NEXT:    retq
   2703 entry:
   2704   %vecext.i.i = extractelement <4 x float> %__B, i32 0
   2705   %vecext1.i.i = extractelement <4 x float> %__A, i32 0
   2706   %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
   2707   %0 = and i8 %__U, 1
   2708   %tobool.i = icmp eq i8 %0, 0
   2709   %cond.i = select i1 %tobool.i, float 0.000000e+00, float %sub.i.i
   2710   %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
   2711   ret <4 x float> %vecins.i
   2712 }
   2713 
   2714 define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   2715 ; X86-LABEL: test_mm_mask_sub_sd:
   2716 ; X86:       # %bb.0: # %entry
   2717 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2718 ; X86-NEXT:    kmovw %eax, %k1
   2719 ; X86-NEXT:    vsubsd %xmm2, %xmm1, %xmm0 {%k1}
   2720 ; X86-NEXT:    retl
   2721 ;
   2722 ; X64-LABEL: test_mm_mask_sub_sd:
   2723 ; X64:       # %bb.0: # %entry
   2724 ; X64-NEXT:    kmovw %edi, %k1
   2725 ; X64-NEXT:    vsubsd %xmm2, %xmm1, %xmm0 {%k1}
   2726 ; X64-NEXT:    retq
   2727 entry:
   2728   %vecext.i.i = extractelement <2 x double> %__B, i32 0
   2729   %vecext1.i.i = extractelement <2 x double> %__A, i32 0
   2730   %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
   2731   %0 = and i8 %__U, 1
   2732   %tobool.i = icmp eq i8 %0, 0
   2733   %vecext1.i = extractelement <2 x double> %__W, i32 0
   2734   %cond.i = select i1 %tobool.i, double %vecext1.i, double %sub.i.i
   2735   %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
   2736   ret <2 x double> %vecins.i
   2737 }
   2738 
   2739 define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   2740 ; X86-LABEL: test_mm_maskz_sub_sd:
   2741 ; X86:       # %bb.0: # %entry
   2742 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2743 ; X86-NEXT:    kmovw %eax, %k1
   2744 ; X86-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
   2745 ; X86-NEXT:    retl
   2746 ;
   2747 ; X64-LABEL: test_mm_maskz_sub_sd:
   2748 ; X64:       # %bb.0: # %entry
   2749 ; X64-NEXT:    kmovw %edi, %k1
   2750 ; X64-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
   2751 ; X64-NEXT:    retq
   2752 entry:
   2753   %vecext.i.i = extractelement <2 x double> %__B, i32 0
   2754   %vecext1.i.i = extractelement <2 x double> %__A, i32 0
   2755   %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
   2756   %0 = and i8 %__U, 1
   2757   %tobool.i = icmp eq i8 %0, 0
   2758   %cond.i = select i1 %tobool.i, double 0.000000e+00, double %sub.i.i
   2759   %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
   2760   ret <2 x double> %vecins.i
   2761 }
   2762 
   2763 define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   2764 ; X86-LABEL: test_mm_mask_mul_ss:
   2765 ; X86:       # %bb.0: # %entry
   2766 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2767 ; X86-NEXT:    kmovw %eax, %k1
   2768 ; X86-NEXT:    vmulss %xmm2, %xmm1, %xmm0 {%k1}
   2769 ; X86-NEXT:    retl
   2770 ;
   2771 ; X64-LABEL: test_mm_mask_mul_ss:
   2772 ; X64:       # %bb.0: # %entry
   2773 ; X64-NEXT:    kmovw %edi, %k1
   2774 ; X64-NEXT:    vmulss %xmm2, %xmm1, %xmm0 {%k1}
   2775 ; X64-NEXT:    retq
   2776 entry:
   2777   %vecext.i.i = extractelement <4 x float> %__B, i32 0
   2778   %vecext1.i.i = extractelement <4 x float> %__A, i32 0
   2779   %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
   2780   %0 = and i8 %__U, 1
   2781   %tobool.i = icmp eq i8 %0, 0
   2782   %vecext1.i = extractelement <4 x float> %__W, i32 0
   2783   %cond.i = select i1 %tobool.i, float %vecext1.i, float %mul.i.i
   2784   %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
   2785   ret <4 x float> %vecins.i
   2786 }
   2787 
   2788 define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   2789 ; X86-LABEL: test_mm_maskz_mul_ss:
   2790 ; X86:       # %bb.0: # %entry
   2791 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2792 ; X86-NEXT:    kmovw %eax, %k1
   2793 ; X86-NEXT:    vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
   2794 ; X86-NEXT:    retl
   2795 ;
   2796 ; X64-LABEL: test_mm_maskz_mul_ss:
   2797 ; X64:       # %bb.0: # %entry
   2798 ; X64-NEXT:    kmovw %edi, %k1
   2799 ; X64-NEXT:    vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
   2800 ; X64-NEXT:    retq
   2801 entry:
   2802   %vecext.i.i = extractelement <4 x float> %__B, i32 0
   2803   %vecext1.i.i = extractelement <4 x float> %__A, i32 0
   2804   %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
   2805   %0 = and i8 %__U, 1
   2806   %tobool.i = icmp eq i8 %0, 0
   2807   %cond.i = select i1 %tobool.i, float 0.000000e+00, float %mul.i.i
   2808   %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
   2809   ret <4 x float> %vecins.i
   2810 }
   2811 
   2812 define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   2813 ; X86-LABEL: test_mm_mask_mul_sd:
   2814 ; X86:       # %bb.0: # %entry
   2815 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2816 ; X86-NEXT:    kmovw %eax, %k1
   2817 ; X86-NEXT:    vmulsd %xmm2, %xmm1, %xmm0 {%k1}
   2818 ; X86-NEXT:    retl
   2819 ;
   2820 ; X64-LABEL: test_mm_mask_mul_sd:
   2821 ; X64:       # %bb.0: # %entry
   2822 ; X64-NEXT:    kmovw %edi, %k1
   2823 ; X64-NEXT:    vmulsd %xmm2, %xmm1, %xmm0 {%k1}
   2824 ; X64-NEXT:    retq
   2825 entry:
   2826   %vecext.i.i = extractelement <2 x double> %__B, i32 0
   2827   %vecext1.i.i = extractelement <2 x double> %__A, i32 0
   2828   %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
   2829   %0 = and i8 %__U, 1
   2830   %tobool.i = icmp eq i8 %0, 0
   2831   %vecext1.i = extractelement <2 x double> %__W, i32 0
   2832   %cond.i = select i1 %tobool.i, double %vecext1.i, double %mul.i.i
   2833   %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
   2834   ret <2 x double> %vecins.i
   2835 }
   2836 
   2837 define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   2838 ; X86-LABEL: test_mm_maskz_mul_sd:
   2839 ; X86:       # %bb.0: # %entry
   2840 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2841 ; X86-NEXT:    kmovw %eax, %k1
   2842 ; X86-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
   2843 ; X86-NEXT:    retl
   2844 ;
   2845 ; X64-LABEL: test_mm_maskz_mul_sd:
   2846 ; X64:       # %bb.0: # %entry
   2847 ; X64-NEXT:    kmovw %edi, %k1
   2848 ; X64-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
   2849 ; X64-NEXT:    retq
   2850 entry:
   2851   %vecext.i.i = extractelement <2 x double> %__B, i32 0
   2852   %vecext1.i.i = extractelement <2 x double> %__A, i32 0
   2853   %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
   2854   %0 = and i8 %__U, 1
   2855   %tobool.i = icmp eq i8 %0, 0
   2856   %cond.i = select i1 %tobool.i, double 0.000000e+00, double %mul.i.i
   2857   %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
   2858   ret <2 x double> %vecins.i
   2859 }
   2860 
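; From here on the scalar tests use the bitcast-to-<8 x i1> mask pattern (extract
; bit 0 of the mask vector) rather than 'and i8 %__U, 1'.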
   2861 define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   2862 ; X86-LABEL: test_mm_mask_div_ss:
   2863 ; X86:       # %bb.0: # %entry
   2864 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2865 ; X86-NEXT:    kmovw %eax, %k1
   2866 ; X86-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1}
   2867 ; X86-NEXT:    retl
   2868 ;
   2869 ; X64-LABEL: test_mm_mask_div_ss:
   2870 ; X64:       # %bb.0: # %entry
   2871 ; X64-NEXT:    kmovw %edi, %k1
   2872 ; X64-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1}
   2873 ; X64-NEXT:    retq
   2874 entry:
   2875   %0 = extractelement <4 x float> %__A, i64 0
   2876   %1 = extractelement <4 x float> %__B, i64 0
   2877   %2 = extractelement <4 x float> %__W, i64 0
   2878   %3 = fdiv float %0, %1
   2879   %4 = bitcast i8 %__U to <8 x i1>
   2880   %5 = extractelement <8 x i1> %4, i64 0
   2881   %6 = select i1 %5, float %3, float %2
   2882   %7 = insertelement <4 x float> %__A, float %6, i64 0
   2883   ret <4 x float> %7
   2884 }
   2885 
   2886 define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   2887 ; X86-LABEL: test_mm_maskz_div_ss:
   2888 ; X86:       # %bb.0: # %entry
   2889 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2890 ; X86-NEXT:    kmovw %eax, %k1
   2891 ; X86-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
   2892 ; X86-NEXT:    retl
   2893 ;
   2894 ; X64-LABEL: test_mm_maskz_div_ss:
   2895 ; X64:       # %bb.0: # %entry
   2896 ; X64-NEXT:    kmovw %edi, %k1
   2897 ; X64-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
   2898 ; X64-NEXT:    retq
   2899 entry:
   2900   %0 = extractelement <4 x float> %__A, i64 0
   2901   %1 = extractelement <4 x float> %__B, i64 0
   2902   %2 = fdiv float %0, %1
   2903   %3 = bitcast i8 %__U to <8 x i1>
   2904   %4 = extractelement <8 x i1> %3, i64 0
   2905   %5 = select i1 %4, float %2, float 0.000000e+00
   2906   %6 = insertelement <4 x float> %__A, float %5, i64 0
   2907   ret <4 x float> %6
   2908 }
   2909 
   2910 define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   2911 ; X86-LABEL: test_mm_mask_div_sd:
   2912 ; X86:       # %bb.0: # %entry
   2913 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2914 ; X86-NEXT:    kmovw %eax, %k1
   2915 ; X86-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1}
   2916 ; X86-NEXT:    retl
   2917 ;
   2918 ; X64-LABEL: test_mm_mask_div_sd:
   2919 ; X64:       # %bb.0: # %entry
   2920 ; X64-NEXT:    kmovw %edi, %k1
   2921 ; X64-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1}
   2922 ; X64-NEXT:    retq
   2923 entry:
   2924   %0 = extractelement <2 x double> %__A, i64 0
   2925   %1 = extractelement <2 x double> %__B, i64 0
   2926   %2 = extractelement <2 x double> %__W, i64 0
   2927   %3 = fdiv double %0, %1
   2928   %4 = bitcast i8 %__U to <8 x i1>
   2929   %5 = extractelement <8 x i1> %4, i64 0
   2930   %6 = select i1 %5, double %3, double %2
   2931   %7 = insertelement <2 x double> %__A, double %6, i64 0
   2932   ret <2 x double> %7
   2933 }
   2934 
   2935 define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   2936 ; X86-LABEL: test_mm_maskz_div_sd:
   2937 ; X86:       # %bb.0: # %entry
   2938 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2939 ; X86-NEXT:    kmovw %eax, %k1
   2940 ; X86-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
   2941 ; X86-NEXT:    retl
   2942 ;
   2943 ; X64-LABEL: test_mm_maskz_div_sd:
   2944 ; X64:       # %bb.0: # %entry
   2945 ; X64-NEXT:    kmovw %edi, %k1
   2946 ; X64-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
   2947 ; X64-NEXT:    retq
   2948 entry:
   2949   %0 = extractelement <2 x double> %__A, i64 0
   2950   %1 = extractelement <2 x double> %__B, i64 0
   2951   %2 = fdiv double %0, %1
   2952   %3 = bitcast i8 %__U to <8 x i1>
   2953   %4 = extractelement <8 x i1> %3, i64 0
   2954   %5 = select i1 %4, double %2, double 0.000000e+00
   2955   %6 = insertelement <2 x double> %__A, double %5, i64 0
   2956   ret <2 x double> %6
   2957 }
   2958 
   2959 
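; 512-bit FMA tests using the rounding intrinsic llvm.x86.avx512.vfmadd.pd.512 with
; rounding argument 8 (round-to-nearest, suppress-all-exceptions), which corresponds
; to the {rn-sae} operand in the checked assembly. The fmsub/fnmadd/fnmsub variants
; are expressed by negating the relevant operands with an fsub from -0.0.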
   2960 define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   2961 ; CHECK-LABEL: test_mm512_fmadd_round_pd:
   2962 ; CHECK:       # %bb.0: # %entry
   2963 ; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
   2964 ; CHECK-NEXT:    ret{{[l|q]}}
   2965 entry:
   2966   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
   2967   ret <8 x double> %0
   2968 }
   2969 
   2970 declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1
   2971 
   2972 define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
   2973 ; X86-LABEL: test_mm512_mask_fmadd_round_pd:
   2974 ; X86:       # %bb.0: # %entry
   2975 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2976 ; X86-NEXT:    kmovw %eax, %k1
   2977 ; X86-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   2978 ; X86-NEXT:    retl
   2979 ;
   2980 ; X64-LABEL: test_mm512_mask_fmadd_round_pd:
   2981 ; X64:       # %bb.0: # %entry
   2982 ; X64-NEXT:    kmovw %edi, %k1
   2983 ; X64-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   2984 ; X64-NEXT:    retq
   2985 entry:
   2986   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
   2987   %1 = bitcast i8 %__U to <8 x i1>
   2988   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
   2989   ret <8 x double> %2
   2990 }
   2991 
   2992 define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
   2993 ; X86-LABEL: test_mm512_mask3_fmadd_round_pd:
   2994 ; X86:       # %bb.0: # %entry
   2995 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   2996 ; X86-NEXT:    kmovw %eax, %k1
   2997 ; X86-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   2998 ; X86-NEXT:    vmovapd %zmm2, %zmm0
   2999 ; X86-NEXT:    retl
   3000 ;
   3001 ; X64-LABEL: test_mm512_mask3_fmadd_round_pd:
   3002 ; X64:       # %bb.0: # %entry
   3003 ; X64-NEXT:    kmovw %edi, %k1
   3004 ; X64-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   3005 ; X64-NEXT:    vmovapd %zmm2, %zmm0
   3006 ; X64-NEXT:    retq
   3007 entry:
   3008   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
   3009   %1 = bitcast i8 %__U to <8 x i1>
   3010   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
   3011   ret <8 x double> %2
   3012 }
   3013 
   3014 define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3015 ; X86-LABEL: test_mm512_maskz_fmadd_round_pd:
   3016 ; X86:       # %bb.0: # %entry
   3017 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3018 ; X86-NEXT:    kmovw %eax, %k1
   3019 ; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3020 ; X86-NEXT:    retl
   3021 ;
   3022 ; X64-LABEL: test_mm512_maskz_fmadd_round_pd:
   3023 ; X64:       # %bb.0: # %entry
   3024 ; X64-NEXT:    kmovw %edi, %k1
   3025 ; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3026 ; X64-NEXT:    retq
   3027 entry:
   3028   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
   3029   %1 = bitcast i8 %__U to <8 x i1>
   3030   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   3031   ret <8 x double> %2
   3032 }
   3033 
   3034 define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3035 ; X86-LABEL: test_mm512_fmsub_round_pd:
   3036 ; X86:       # %bb.0: # %entry
   3037 ; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
   3038 ; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
   3039 ; X86-NEXT:    retl
   3040 ;
   3041 ; X64-LABEL: test_mm512_fmsub_round_pd:
   3042 ; X64:       # %bb.0: # %entry
   3043 ; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
   3044 ; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
   3045 ; X64-NEXT:    retq
   3046 entry:
   3047   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   3048   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
   3049   ret <8 x double> %0
   3050 }
   3051 
   3052 define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
   3053 ; X86-LABEL: test_mm512_mask_fmsub_round_pd:
   3054 ; X86:       # %bb.0: # %entry
   3055 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3056 ; X86-NEXT:    kmovw %eax, %k1
   3057 ; X86-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   3058 ; X86-NEXT:    retl
   3059 ;
   3060 ; X64-LABEL: test_mm512_mask_fmsub_round_pd:
   3061 ; X64:       # %bb.0: # %entry
   3062 ; X64-NEXT:    kmovw %edi, %k1
   3063 ; X64-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   3064 ; X64-NEXT:    retq
   3065 entry:
   3066   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   3067   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
   3068   %1 = bitcast i8 %__U to <8 x i1>
   3069   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
   3070   ret <8 x double> %2
   3071 }
   3072 
   3073 define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3074 ; X86-LABEL: test_mm512_maskz_fmsub_round_pd:
   3075 ; X86:       # %bb.0: # %entry
   3076 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3077 ; X86-NEXT:    kmovw %eax, %k1
   3078 ; X86-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3079 ; X86-NEXT:    retl
   3080 ;
   3081 ; X64-LABEL: test_mm512_maskz_fmsub_round_pd:
   3082 ; X64:       # %bb.0: # %entry
   3083 ; X64-NEXT:    kmovw %edi, %k1
   3084 ; X64-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3085 ; X64-NEXT:    retq
   3086 entry:
   3087   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   3088   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
   3089   %1 = bitcast i8 %__U to <8 x i1>
   3090   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   3091   ret <8 x double> %2
   3092 }
   3093 
   3094 define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3095 ; X86-LABEL: test_mm512_fnmadd_round_pd:
   3096 ; X86:       # %bb.0: # %entry
   3097 ; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
   3098 ; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
   3099 ; X86-NEXT:    retl
   3100 ;
   3101 ; X64-LABEL: test_mm512_fnmadd_round_pd:
   3102 ; X64:       # %bb.0: # %entry
   3103 ; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
   3104 ; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
   3105 ; X64-NEXT:    retq
   3106 entry:
   3107   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   3108   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
   3109   ret <8 x double> %0
   3110 }
   3111 
   3112 define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
   3113 ; X86-LABEL: test_mm512_mask3_fnmadd_round_pd:
   3114 ; X86:       # %bb.0: # %entry
   3115 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3116 ; X86-NEXT:    kmovw %eax, %k1
   3117 ; X86-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   3118 ; X86-NEXT:    vmovapd %zmm2, %zmm0
   3119 ; X86-NEXT:    retl
   3120 ;
   3121 ; X64-LABEL: test_mm512_mask3_fnmadd_round_pd:
   3122 ; X64:       # %bb.0: # %entry
   3123 ; X64-NEXT:    kmovw %edi, %k1
   3124 ; X64-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   3125 ; X64-NEXT:    vmovapd %zmm2, %zmm0
   3126 ; X64-NEXT:    retq
   3127 entry:
   3128   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   3129   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
   3130   %1 = bitcast i8 %__U to <8 x i1>
   3131   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
   3132   ret <8 x double> %2
   3133 }
   3134 
   3135 define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3136 ; X86-LABEL: test_mm512_maskz_fnmadd_round_pd:
   3137 ; X86:       # %bb.0: # %entry
   3138 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3139 ; X86-NEXT:    kmovw %eax, %k1
   3140 ; X86-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3141 ; X86-NEXT:    retl
   3142 ;
   3143 ; X64-LABEL: test_mm512_maskz_fnmadd_round_pd:
   3144 ; X64:       # %bb.0: # %entry
   3145 ; X64-NEXT:    kmovw %edi, %k1
   3146 ; X64-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3147 ; X64-NEXT:    retq
   3148 entry:
   3149   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   3150   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
   3151   %1 = bitcast i8 %__U to <8 x i1>
   3152   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   3153   ret <8 x double> %2
   3154 }
   3155 
   3156 define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3157 ; CHECK-LABEL: test_mm512_fnmsub_round_pd:
   3158 ; CHECK:       # %bb.0: # %entry
   3159 ; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
   3160 ; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
   3161 ; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
   3162 ; CHECK-NEXT:    vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
   3163 ; CHECK-NEXT:    ret{{[l|q]}}
   3164 entry:
   3165   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   3166   %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   3167   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
   3168   ret <8 x double> %0
   3169 }
   3170 
   3171 define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3172 ; X86-LABEL: test_mm512_maskz_fnmsub_round_pd:
   3173 ; X86:       # %bb.0: # %entry
   3174 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3175 ; X86-NEXT:    kmovw %eax, %k1
   3176 ; X86-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3177 ; X86-NEXT:    retl
   3178 ;
   3179 ; X64-LABEL: test_mm512_maskz_fnmsub_round_pd:
   3180 ; X64:       # %bb.0: # %entry
   3181 ; X64-NEXT:    kmovw %edi, %k1
   3182 ; X64-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3183 ; X64-NEXT:    retq
   3184 entry:
   3185   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   3186   %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   3187   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
   3188   %1 = bitcast i8 %__U to <8 x i1>
   3189   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   3190   ret <8 x double> %2
   3191 }
   3192 
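; The same FMA patterns without explicit rounding, built on the generic
; llvm.fma.v8f64 intrinsic; operand negation is again modelled with an fsub from
; -0.0, and masking via a bitcast to <8 x i1> plus select.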
   3193 define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3194 ; CHECK-LABEL: test_mm512_fmadd_pd:
   3195 ; CHECK:       # %bb.0: # %entry
   3196 ; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3197 ; CHECK-NEXT:    ret{{[l|q]}}
   3198 entry:
   3199   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
   3200   ret <8 x double> %0
   3201 }
   3202 
   3203 define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
   3204 ; X86-LABEL: test_mm512_mask_fmadd_pd:
   3205 ; X86:       # %bb.0: # %entry
   3206 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3207 ; X86-NEXT:    kmovw %eax, %k1
   3208 ; X86-NEXT:    vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
   3209 ; X86-NEXT:    retl
   3210 ;
   3211 ; X64-LABEL: test_mm512_mask_fmadd_pd:
   3212 ; X64:       # %bb.0: # %entry
   3213 ; X64-NEXT:    kmovw %edi, %k1
   3214 ; X64-NEXT:    vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
   3215 ; X64-NEXT:    retq
   3216 entry:
   3217   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
   3218   %1 = bitcast i8 %__U to <8 x i1>
   3219   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
   3220   ret <8 x double> %2
   3221 }
   3222 
   3223 define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
   3224 ; X86-LABEL: test_mm512_mask3_fmadd_pd:
   3225 ; X86:       # %bb.0: # %entry
   3226 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3227 ; X86-NEXT:    kmovw %eax, %k1
   3228 ; X86-NEXT:    vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
   3229 ; X86-NEXT:    vmovapd %zmm2, %zmm0
   3230 ; X86-NEXT:    retl
   3231 ;
   3232 ; X64-LABEL: test_mm512_mask3_fmadd_pd:
   3233 ; X64:       # %bb.0: # %entry
   3234 ; X64-NEXT:    kmovw %edi, %k1
   3235 ; X64-NEXT:    vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
   3236 ; X64-NEXT:    vmovapd %zmm2, %zmm0
   3237 ; X64-NEXT:    retq
   3238 entry:
   3239   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
   3240   %1 = bitcast i8 %__U to <8 x i1>
   3241   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
   3242   ret <8 x double> %2
   3243 }
   3244 
   3245 define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3246 ; X86-LABEL: test_mm512_maskz_fmadd_pd:
   3247 ; X86:       # %bb.0: # %entry
   3248 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3249 ; X86-NEXT:    kmovw %eax, %k1
   3250 ; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3251 ; X86-NEXT:    retl
   3252 ;
   3253 ; X64-LABEL: test_mm512_maskz_fmadd_pd:
   3254 ; X64:       # %bb.0: # %entry
   3255 ; X64-NEXT:    kmovw %edi, %k1
   3256 ; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3257 ; X64-NEXT:    retq
   3258 entry:
   3259   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
   3260   %1 = bitcast i8 %__U to <8 x i1>
   3261   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   3262   ret <8 x double> %2
   3263 }
   3264 
   3265 define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3266 ; X86-LABEL: test_mm512_fmsub_pd:
   3267 ; X86:       # %bb.0: # %entry
   3268 ; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
   3269 ; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3270 ; X86-NEXT:    retl
   3271 ;
   3272 ; X64-LABEL: test_mm512_fmsub_pd:
   3273 ; X64:       # %bb.0: # %entry
   3274 ; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
   3275 ; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3276 ; X64-NEXT:    retq
   3277 entry:
   3278   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   3279   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
   3280   ret <8 x double> %0
   3281 }
   3282 
   3283 define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
   3284 ; X86-LABEL: test_mm512_mask_fmsub_pd:
   3285 ; X86:       # %bb.0: # %entry
   3286 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3287 ; X86-NEXT:    kmovw %eax, %k1
   3288 ; X86-NEXT:    vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
   3289 ; X86-NEXT:    retl
   3290 ;
   3291 ; X64-LABEL: test_mm512_mask_fmsub_pd:
   3292 ; X64:       # %bb.0: # %entry
   3293 ; X64-NEXT:    kmovw %edi, %k1
   3294 ; X64-NEXT:    vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
   3295 ; X64-NEXT:    retq
   3296 entry:
   3297   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   3298   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
   3299   %1 = bitcast i8 %__U to <8 x i1>
   3300   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
   3301   ret <8 x double> %2
   3302 }
   3303 
   3304 define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3305 ; X86-LABEL: test_mm512_maskz_fmsub_pd:
   3306 ; X86:       # %bb.0: # %entry
   3307 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3308 ; X86-NEXT:    kmovw %eax, %k1
   3309 ; X86-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
   3310 ; X86-NEXT:    retl
   3311 ;
   3312 ; X64-LABEL: test_mm512_maskz_fmsub_pd:
   3313 ; X64:       # %bb.0: # %entry
   3314 ; X64-NEXT:    kmovw %edi, %k1
   3315 ; X64-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
   3316 ; X64-NEXT:    retq
   3317 entry:
   3318   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   3319   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
   3320   %1 = bitcast i8 %__U to <8 x i1>
   3321   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   3322   ret <8 x double> %2
   3323 }
   3324 
   3325 define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3326 ; X86-LABEL: test_mm512_fnmadd_pd:
   3327 ; X86:       # %bb.0: # %entry
   3328 ; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
   3329 ; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3330 ; X86-NEXT:    retl
   3331 ;
   3332 ; X64-LABEL: test_mm512_fnmadd_pd:
   3333 ; X64:       # %bb.0: # %entry
   3334 ; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
   3335 ; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3336 ; X64-NEXT:    retq
   3337 entry:
   3338   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   3339   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
   3340   ret <8 x double> %0
   3341 }
   3342 
   3343 define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
   3344 ; X86-LABEL: test_mm512_mask3_fnmadd_pd:
   3345 ; X86:       # %bb.0: # %entry
   3346 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3347 ; X86-NEXT:    kmovw %eax, %k1
   3348 ; X86-NEXT:    vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
   3349 ; X86-NEXT:    vmovapd %zmm2, %zmm0
   3350 ; X86-NEXT:    retl
   3351 ;
   3352 ; X64-LABEL: test_mm512_mask3_fnmadd_pd:
   3353 ; X64:       # %bb.0: # %entry
   3354 ; X64-NEXT:    kmovw %edi, %k1
   3355 ; X64-NEXT:    vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
   3356 ; X64-NEXT:    vmovapd %zmm2, %zmm0
   3357 ; X64-NEXT:    retq
   3358 entry:
   3359   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   3360   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
   3361   %1 = bitcast i8 %__U to <8 x i1>
   3362   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
   3363   ret <8 x double> %2
   3364 }
   3365 
   3366 define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3367 ; X86-LABEL: test_mm512_maskz_fnmadd_pd:
   3368 ; X86:       # %bb.0: # %entry
   3369 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3370 ; X86-NEXT:    kmovw %eax, %k1
   3371 ; X86-NEXT:    vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
   3372 ; X86-NEXT:    retl
   3373 ;
   3374 ; X64-LABEL: test_mm512_maskz_fnmadd_pd:
   3375 ; X64:       # %bb.0: # %entry
   3376 ; X64-NEXT:    kmovw %edi, %k1
   3377 ; X64-NEXT:    vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
   3378 ; X64-NEXT:    retq
   3379 entry:
   3380   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   3381   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
   3382   %1 = bitcast i8 %__U to <8 x i1>
   3383   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   3384   ret <8 x double> %2
   3385 }
   3386 
   3387 define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3388 ; CHECK-LABEL: test_mm512_fnmsub_pd:
   3389 ; CHECK:       # %bb.0: # %entry
   3390 ; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
   3391 ; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
   3392 ; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
   3393 ; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
   3394 ; CHECK-NEXT:    ret{{[l|q]}}
   3395 entry:
   3396   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   3397   %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   3398   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
   3399   ret <8 x double> %0
   3400 }
   3401 
   3402 define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3403 ; X86-LABEL: test_mm512_maskz_fnmsub_pd:
   3404 ; X86:       # %bb.0: # %entry
   3405 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3406 ; X86-NEXT:    kmovw %eax, %k1
   3407 ; X86-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
   3408 ; X86-NEXT:    retl
   3409 ;
   3410 ; X64-LABEL: test_mm512_maskz_fnmsub_pd:
   3411 ; X64:       # %bb.0: # %entry
   3412 ; X64-NEXT:    kmovw %edi, %k1
   3413 ; X64-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
   3414 ; X64-NEXT:    retq
   3415 entry:
   3416   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   3417   %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   3418   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
   3419   %1 = bitcast i8 %__U to <8 x i1>
   3420   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   3421   ret <8 x double> %2
   3422 }
   3423 
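; Single-precision counterparts of the rounded FMA tests, using
; llvm.x86.avx512.vfmadd.ps.512 with a 16-bit mask.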
   3424 define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3425 ; CHECK-LABEL: test_mm512_fmadd_round_ps:
   3426 ; CHECK:       # %bb.0: # %entry
   3427 ; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
   3428 ; CHECK-NEXT:    ret{{[l|q]}}
   3429 entry:
   3430   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
   3431   ret <16 x float> %0
   3432 }
   3433 
   3434 declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
   3435 
   3436 define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
   3437 ; X86-LABEL: test_mm512_mask_fmadd_round_ps:
   3438 ; X86:       # %bb.0: # %entry
   3439 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3440 ; X86-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   3441 ; X86-NEXT:    retl
   3442 ;
   3443 ; X64-LABEL: test_mm512_mask_fmadd_round_ps:
   3444 ; X64:       # %bb.0: # %entry
   3445 ; X64-NEXT:    kmovw %edi, %k1
   3446 ; X64-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   3447 ; X64-NEXT:    retq
   3448 entry:
   3449   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
   3450   %1 = bitcast i16 %__U to <16 x i1>
   3451   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
   3452   ret <16 x float> %2
   3453 }
   3454 
   3455 define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
   3456 ; X86-LABEL: test_mm512_mask3_fmadd_round_ps:
   3457 ; X86:       # %bb.0: # %entry
   3458 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3459 ; X86-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   3460 ; X86-NEXT:    vmovaps %zmm2, %zmm0
   3461 ; X86-NEXT:    retl
   3462 ;
   3463 ; X64-LABEL: test_mm512_mask3_fmadd_round_ps:
   3464 ; X64:       # %bb.0: # %entry
   3465 ; X64-NEXT:    kmovw %edi, %k1
   3466 ; X64-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   3467 ; X64-NEXT:    vmovaps %zmm2, %zmm0
   3468 ; X64-NEXT:    retq
   3469 entry:
   3470   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
   3471   %1 = bitcast i16 %__U to <16 x i1>
   3472   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
   3473   ret <16 x float> %2
   3474 }
   3475 
   3476 define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3477 ; X86-LABEL: test_mm512_maskz_fmadd_round_ps:
   3478 ; X86:       # %bb.0: # %entry
   3479 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3480 ; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3481 ; X86-NEXT:    retl
   3482 ;
   3483 ; X64-LABEL: test_mm512_maskz_fmadd_round_ps:
   3484 ; X64:       # %bb.0: # %entry
   3485 ; X64-NEXT:    kmovw %edi, %k1
   3486 ; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3487 ; X64-NEXT:    retq
   3488 entry:
   3489   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
   3490   %1 = bitcast i16 %__U to <16 x i1>
   3491   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   3492   ret <16 x float> %2
   3493 }
   3494 
   3495 define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3496 ; X86-LABEL: test_mm512_fmsub_round_ps:
   3497 ; X86:       # %bb.0: # %entry
   3498 ; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
   3499 ; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
   3500 ; X86-NEXT:    retl
   3501 ;
   3502 ; X64-LABEL: test_mm512_fmsub_round_ps:
   3503 ; X64:       # %bb.0: # %entry
   3504 ; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3505 ; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
   3506 ; X64-NEXT:    retq
   3507 entry:
   3508   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   3509   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
   3510   ret <16 x float> %0
   3511 }
   3512 
   3513 define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
   3514 ; X86-LABEL: test_mm512_mask_fmsub_round_ps:
   3515 ; X86:       # %bb.0: # %entry
   3516 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3517 ; X86-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   3518 ; X86-NEXT:    retl
   3519 ;
   3520 ; X64-LABEL: test_mm512_mask_fmsub_round_ps:
   3521 ; X64:       # %bb.0: # %entry
   3522 ; X64-NEXT:    kmovw %edi, %k1
   3523 ; X64-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   3524 ; X64-NEXT:    retq
   3525 entry:
   3526   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   3527   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
   3528   %1 = bitcast i16 %__U to <16 x i1>
   3529   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
   3530   ret <16 x float> %2
   3531 }
   3532 
   3533 define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3534 ; X86-LABEL: test_mm512_maskz_fmsub_round_ps:
   3535 ; X86:       # %bb.0: # %entry
   3536 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3537 ; X86-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3538 ; X86-NEXT:    retl
   3539 ;
   3540 ; X64-LABEL: test_mm512_maskz_fmsub_round_ps:
   3541 ; X64:       # %bb.0: # %entry
   3542 ; X64-NEXT:    kmovw %edi, %k1
   3543 ; X64-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3544 ; X64-NEXT:    retq
   3545 entry:
   3546   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   3547   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
   3548   %1 = bitcast i16 %__U to <16 x i1>
   3549   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   3550   ret <16 x float> %2
   3551 }
   3552 
   3553 define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3554 ; X86-LABEL: test_mm512_fnmadd_round_ps:
   3555 ; X86:       # %bb.0: # %entry
   3556 ; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
   3557 ; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
   3558 ; X86-NEXT:    retl
   3559 ;
   3560 ; X64-LABEL: test_mm512_fnmadd_round_ps:
   3561 ; X64:       # %bb.0: # %entry
   3562 ; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
   3563 ; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
   3564 ; X64-NEXT:    retq
   3565 entry:
   3566   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   3567   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
   3568   ret <16 x float> %0
   3569 }
   3570 
   3571 define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
   3572 ; X86-LABEL: test_mm512_mask3_fnmadd_round_ps:
   3573 ; X86:       # %bb.0: # %entry
   3574 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3575 ; X86-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   3576 ; X86-NEXT:    vmovaps %zmm2, %zmm0
   3577 ; X86-NEXT:    retl
   3578 ;
   3579 ; X64-LABEL: test_mm512_mask3_fnmadd_round_ps:
   3580 ; X64:       # %bb.0: # %entry
   3581 ; X64-NEXT:    kmovw %edi, %k1
   3582 ; X64-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   3583 ; X64-NEXT:    vmovaps %zmm2, %zmm0
   3584 ; X64-NEXT:    retq
   3585 entry:
   3586   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   3587   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
   3588   %1 = bitcast i16 %__U to <16 x i1>
   3589   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
   3590   ret <16 x float> %2
   3591 }
   3592 
   3593 define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3594 ; X86-LABEL: test_mm512_maskz_fnmadd_round_ps:
   3595 ; X86:       # %bb.0: # %entry
   3596 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3597 ; X86-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3598 ; X86-NEXT:    retl
   3599 ;
   3600 ; X64-LABEL: test_mm512_maskz_fnmadd_round_ps:
   3601 ; X64:       # %bb.0: # %entry
   3602 ; X64-NEXT:    kmovw %edi, %k1
   3603 ; X64-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3604 ; X64-NEXT:    retq
   3605 entry:
   3606   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   3607   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
   3608   %1 = bitcast i16 %__U to <16 x i1>
   3609   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   3610   ret <16 x float> %2
   3611 }
   3612 
   3613 define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3614 ; CHECK-LABEL: test_mm512_fnmsub_round_ps:
   3615 ; CHECK:       # %bb.0: # %entry
   3616 ; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
   3617 ; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
   3618 ; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
   3619 ; CHECK-NEXT:    vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
   3620 ; CHECK-NEXT:    ret{{[l|q]}}
   3621 entry:
   3622   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   3623   %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   3624   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
   3625   ret <16 x float> %0
   3626 }
   3627 
   3628 define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3629 ; X86-LABEL: test_mm512_maskz_fnmsub_round_ps:
   3630 ; X86:       # %bb.0: # %entry
   3631 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3632 ; X86-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3633 ; X86-NEXT:    retl
   3634 ;
   3635 ; X64-LABEL: test_mm512_maskz_fnmsub_round_ps:
   3636 ; X64:       # %bb.0: # %entry
   3637 ; X64-NEXT:    kmovw %edi, %k1
   3638 ; X64-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3639 ; X64-NEXT:    retq
   3640 entry:
   3641   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   3642   %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   3643   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
   3644   %1 = bitcast i16 %__U to <16 x i1>
   3645   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   3646   ret <16 x float> %2
   3647 }
   3648 
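; NOTE: The non-rounding *_ps tests below use the generic @llvm.fma.v16f32
; intrinsic; the fmsub/fnmadd/fnmsub flavors express operand negation as an
; fsub from a splat of -0.0, which (per the checks) is either folded into an
; fmsub/fnmadd/fnmsub instruction or materialized as a vpxord sign-bit flip.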
   3649 define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3650 ; CHECK-LABEL: test_mm512_fmadd_ps:
   3651 ; CHECK:       # %bb.0: # %entry
   3652 ; CHECK-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3653 ; CHECK-NEXT:    ret{{[l|q]}}
   3654 entry:
   3655   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
   3656   ret <16 x float> %0
   3657 }
   3658 
   3659 define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
   3660 ; X86-LABEL: test_mm512_mask_fmadd_ps:
   3661 ; X86:       # %bb.0: # %entry
   3662 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3663 ; X86-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
   3664 ; X86-NEXT:    retl
   3665 ;
   3666 ; X64-LABEL: test_mm512_mask_fmadd_ps:
   3667 ; X64:       # %bb.0: # %entry
   3668 ; X64-NEXT:    kmovw %edi, %k1
   3669 ; X64-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
   3670 ; X64-NEXT:    retq
   3671 entry:
   3672   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
   3673   %1 = bitcast i16 %__U to <16 x i1>
   3674   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
   3675   ret <16 x float> %2
   3676 }
   3677 
   3678 define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
   3679 ; X86-LABEL: test_mm512_mask3_fmadd_ps:
   3680 ; X86:       # %bb.0: # %entry
   3681 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3682 ; X86-NEXT:    vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
   3683 ; X86-NEXT:    vmovaps %zmm2, %zmm0
   3684 ; X86-NEXT:    retl
   3685 ;
   3686 ; X64-LABEL: test_mm512_mask3_fmadd_ps:
   3687 ; X64:       # %bb.0: # %entry
   3688 ; X64-NEXT:    kmovw %edi, %k1
   3689 ; X64-NEXT:    vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
   3690 ; X64-NEXT:    vmovaps %zmm2, %zmm0
   3691 ; X64-NEXT:    retq
   3692 entry:
   3693   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
   3694   %1 = bitcast i16 %__U to <16 x i1>
   3695   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
   3696   ret <16 x float> %2
   3697 }
   3698 
   3699 define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3700 ; X86-LABEL: test_mm512_maskz_fmadd_ps:
   3701 ; X86:       # %bb.0: # %entry
   3702 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3703 ; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3704 ; X86-NEXT:    retl
   3705 ;
   3706 ; X64-LABEL: test_mm512_maskz_fmadd_ps:
   3707 ; X64:       # %bb.0: # %entry
   3708 ; X64-NEXT:    kmovw %edi, %k1
   3709 ; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3710 ; X64-NEXT:    retq
   3711 entry:
   3712   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
   3713   %1 = bitcast i16 %__U to <16 x i1>
   3714   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   3715   ret <16 x float> %2
   3716 }
   3717 
   3718 define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3719 ; X86-LABEL: test_mm512_fmsub_ps:
   3720 ; X86:       # %bb.0: # %entry
   3721 ; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
   3722 ; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3723 ; X86-NEXT:    retl
   3724 ;
   3725 ; X64-LABEL: test_mm512_fmsub_ps:
   3726 ; X64:       # %bb.0: # %entry
   3727 ; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3728 ; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3729 ; X64-NEXT:    retq
   3730 entry:
   3731   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   3732   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
   3733   ret <16 x float> %0
   3734 }
   3735 
   3736 define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
   3737 ; X86-LABEL: test_mm512_mask_fmsub_ps:
   3738 ; X86:       # %bb.0: # %entry
   3739 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3740 ; X86-NEXT:    vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
   3741 ; X86-NEXT:    retl
   3742 ;
   3743 ; X64-LABEL: test_mm512_mask_fmsub_ps:
   3744 ; X64:       # %bb.0: # %entry
   3745 ; X64-NEXT:    kmovw %edi, %k1
   3746 ; X64-NEXT:    vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
   3747 ; X64-NEXT:    retq
   3748 entry:
   3749   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   3750   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
   3751   %1 = bitcast i16 %__U to <16 x i1>
   3752   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
   3753   ret <16 x float> %2
   3754 }
   3755 
   3756 define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3757 ; X86-LABEL: test_mm512_maskz_fmsub_ps:
   3758 ; X86:       # %bb.0: # %entry
   3759 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3760 ; X86-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
   3761 ; X86-NEXT:    retl
   3762 ;
   3763 ; X64-LABEL: test_mm512_maskz_fmsub_ps:
   3764 ; X64:       # %bb.0: # %entry
   3765 ; X64-NEXT:    kmovw %edi, %k1
   3766 ; X64-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
   3767 ; X64-NEXT:    retq
   3768 entry:
   3769   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   3770   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
   3771   %1 = bitcast i16 %__U to <16 x i1>
   3772   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   3773   ret <16 x float> %2
   3774 }
   3775 
   3776 define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3777 ; X86-LABEL: test_mm512_fnmadd_ps:
   3778 ; X86:       # %bb.0: # %entry
   3779 ; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
   3780 ; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3781 ; X86-NEXT:    retl
   3782 ;
   3783 ; X64-LABEL: test_mm512_fnmadd_ps:
   3784 ; X64:       # %bb.0: # %entry
   3785 ; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
   3786 ; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
   3787 ; X64-NEXT:    retq
   3788 entry:
   3789   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   3790   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
   3791   ret <16 x float> %0
   3792 }
   3793 
   3794 define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
   3795 ; X86-LABEL: test_mm512_mask3_fnmadd_ps:
   3796 ; X86:       # %bb.0: # %entry
   3797 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3798 ; X86-NEXT:    vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
   3799 ; X86-NEXT:    vmovaps %zmm2, %zmm0
   3800 ; X86-NEXT:    retl
   3801 ;
   3802 ; X64-LABEL: test_mm512_mask3_fnmadd_ps:
   3803 ; X64:       # %bb.0: # %entry
   3804 ; X64-NEXT:    kmovw %edi, %k1
   3805 ; X64-NEXT:    vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
   3806 ; X64-NEXT:    vmovaps %zmm2, %zmm0
   3807 ; X64-NEXT:    retq
   3808 entry:
   3809   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   3810   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
   3811   %1 = bitcast i16 %__U to <16 x i1>
   3812   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
   3813   ret <16 x float> %2
   3814 }
   3815 
   3816 define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3817 ; X86-LABEL: test_mm512_maskz_fnmadd_ps:
   3818 ; X86:       # %bb.0: # %entry
   3819 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3820 ; X86-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
   3821 ; X86-NEXT:    retl
   3822 ;
   3823 ; X64-LABEL: test_mm512_maskz_fnmadd_ps:
   3824 ; X64:       # %bb.0: # %entry
   3825 ; X64-NEXT:    kmovw %edi, %k1
   3826 ; X64-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
   3827 ; X64-NEXT:    retq
   3828 entry:
   3829   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   3830   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
   3831   %1 = bitcast i16 %__U to <16 x i1>
   3832   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   3833   ret <16 x float> %2
   3834 }
   3835 
   3836 define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3837 ; CHECK-LABEL: test_mm512_fnmsub_ps:
   3838 ; CHECK:       # %bb.0: # %entry
   3839 ; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
   3840 ; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
   3841 ; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
   3842 ; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
   3843 ; CHECK-NEXT:    ret{{[l|q]}}
   3844 entry:
   3845   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   3846   %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   3847   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
   3848   ret <16 x float> %0
   3849 }
   3850 
   3851 define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   3852 ; X86-LABEL: test_mm512_maskz_fnmsub_ps:
   3853 ; X86:       # %bb.0: # %entry
   3854 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   3855 ; X86-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
   3856 ; X86-NEXT:    retl
   3857 ;
   3858 ; X64-LABEL: test_mm512_maskz_fnmsub_ps:
   3859 ; X64:       # %bb.0: # %entry
   3860 ; X64-NEXT:    kmovw %edi, %k1
   3861 ; X64-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
   3862 ; X64-NEXT:    retq
   3863 entry:
   3864   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   3865   %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   3866   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
   3867   %1 = bitcast i16 %__U to <16 x i1>
   3868   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   3869   ret <16 x float> %2
   3870 }
   3871 
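; NOTE: The fmaddsub/fmsubadd round tests use the
; @llvm.x86.avx512.vfmaddsub.{pd,ps}.512 intrinsics with rounding argument 8
; ({rn-sae}); the fmsubadd variants are formed by negating %__C before the call.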
   3872 define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3873 ; CHECK-LABEL: test_mm512_fmaddsub_round_pd:
   3874 ; CHECK:       # %bb.0: # %entry
   3875 ; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
   3876 ; CHECK-NEXT:    ret{{[l|q]}}
   3877 entry:
   3878   %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
   3879   ret <8 x double> %0
   3880 }
   3881 
   3882 declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1
   3883 
   3884 define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
   3885 ; X86-LABEL: test_mm512_mask_fmaddsub_round_pd:
   3886 ; X86:       # %bb.0: # %entry
   3887 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3888 ; X86-NEXT:    kmovw %eax, %k1
   3889 ; X86-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   3890 ; X86-NEXT:    retl
   3891 ;
   3892 ; X64-LABEL: test_mm512_mask_fmaddsub_round_pd:
   3893 ; X64:       # %bb.0: # %entry
   3894 ; X64-NEXT:    kmovw %edi, %k1
   3895 ; X64-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   3896 ; X64-NEXT:    retq
   3897 entry:
   3898   %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
   3899   %1 = bitcast i8 %__U to <8 x i1>
   3900   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
   3901   ret <8 x double> %2
   3902 }
   3903 
   3904 define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
   3905 ; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd:
   3906 ; X86:       # %bb.0: # %entry
   3907 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3908 ; X86-NEXT:    kmovw %eax, %k1
   3909 ; X86-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   3910 ; X86-NEXT:    vmovapd %zmm2, %zmm0
   3911 ; X86-NEXT:    retl
   3912 ;
   3913 ; X64-LABEL: test_mm512_mask3_fmaddsub_round_pd:
   3914 ; X64:       # %bb.0: # %entry
   3915 ; X64-NEXT:    kmovw %edi, %k1
   3916 ; X64-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   3917 ; X64-NEXT:    vmovapd %zmm2, %zmm0
   3918 ; X64-NEXT:    retq
   3919 entry:
   3920   %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
   3921   %1 = bitcast i8 %__U to <8 x i1>
   3922   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
   3923   ret <8 x double> %2
   3924 }
   3925 
   3926 define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3927 ; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd:
   3928 ; X86:       # %bb.0: # %entry
   3929 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3930 ; X86-NEXT:    kmovw %eax, %k1
   3931 ; X86-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3932 ; X86-NEXT:    retl
   3933 ;
   3934 ; X64-LABEL: test_mm512_maskz_fmaddsub_round_pd:
   3935 ; X64:       # %bb.0: # %entry
   3936 ; X64-NEXT:    kmovw %edi, %k1
   3937 ; X64-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3938 ; X64-NEXT:    retq
   3939 entry:
   3940   %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
   3941   %1 = bitcast i8 %__U to <8 x i1>
   3942   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   3943   ret <8 x double> %2
   3944 }
   3945 
   3946 define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3947 ; X86-LABEL: test_mm512_fmsubadd_round_pd:
   3948 ; X86:       # %bb.0: # %entry
   3949 ; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
   3950 ; X86-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
   3951 ; X86-NEXT:    retl
   3952 ;
   3953 ; X64-LABEL: test_mm512_fmsubadd_round_pd:
   3954 ; X64:       # %bb.0: # %entry
   3955 ; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
   3956 ; X64-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
   3957 ; X64-NEXT:    retq
   3958 entry:
   3959   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   3960   %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
   3961   ret <8 x double> %0
   3962 }
   3963 
   3964 define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
   3965 ; X86-LABEL: test_mm512_mask_fmsubadd_round_pd:
   3966 ; X86:       # %bb.0: # %entry
   3967 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3968 ; X86-NEXT:    kmovw %eax, %k1
   3969 ; X86-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   3970 ; X86-NEXT:    retl
   3971 ;
   3972 ; X64-LABEL: test_mm512_mask_fmsubadd_round_pd:
   3973 ; X64:       # %bb.0: # %entry
   3974 ; X64-NEXT:    kmovw %edi, %k1
   3975 ; X64-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   3976 ; X64-NEXT:    retq
   3977 entry:
   3978   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   3979   %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
   3980   %1 = bitcast i8 %__U to <8 x i1>
   3981   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
   3982   ret <8 x double> %2
   3983 }
   3984 
   3985 define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   3986 ; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd:
   3987 ; X86:       # %bb.0: # %entry
   3988 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   3989 ; X86-NEXT:    kmovw %eax, %k1
   3990 ; X86-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3991 ; X86-NEXT:    retl
   3992 ;
   3993 ; X64-LABEL: test_mm512_maskz_fmsubadd_round_pd:
   3994 ; X64:       # %bb.0: # %entry
   3995 ; X64-NEXT:    kmovw %edi, %k1
   3996 ; X64-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   3997 ; X64-NEXT:    retq
   3998 entry:
   3999   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4000   %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
   4001   %1 = bitcast i8 %__U to <8 x i1>
   4002   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   4003   ret <8 x double> %2
   4004 }
   4005 
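; NOTE: The non-rounding fmaddsub/fmsubadd tests below build the result from two
; @llvm.fma calls (one with %__C negated) and a shufflevector interleaving them:
; fmaddsub takes the subtracted result in the even lanes and the added result in
; the odd lanes, and fmsubadd the reverse.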
   4006 define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   4007 ; CHECK-LABEL: test_mm512_fmaddsub_pd:
   4008 ; CHECK:       # %bb.0: # %entry
   4009 ; CHECK-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
   4010 ; CHECK-NEXT:    ret{{[l|q]}}
   4011 entry:
   4012   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
   4013   %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4014   %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
   4015   %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   4016   ret <8 x double> %3
   4017 }
   4018 
   4019 define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
   4020 ; X86-LABEL: test_mm512_mask_fmaddsub_pd:
   4021 ; X86:       # %bb.0: # %entry
   4022 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4023 ; X86-NEXT:    kmovw %eax, %k1
   4024 ; X86-NEXT:    vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
   4025 ; X86-NEXT:    retl
   4026 ;
   4027 ; X64-LABEL: test_mm512_mask_fmaddsub_pd:
   4028 ; X64:       # %bb.0: # %entry
   4029 ; X64-NEXT:    kmovw %edi, %k1
   4030 ; X64-NEXT:    vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
   4031 ; X64-NEXT:    retq
   4032 entry:
   4033   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
   4034   %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4035   %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
   4036   %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   4037   %4 = bitcast i8 %__U to <8 x i1>
   4038   %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__A
   4039   ret <8 x double> %5
   4040 }
   4041 
   4042 define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
   4043 ; X86-LABEL: test_mm512_mask3_fmaddsub_pd:
   4044 ; X86:       # %bb.0: # %entry
   4045 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4046 ; X86-NEXT:    kmovw %eax, %k1
   4047 ; X86-NEXT:    vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
   4048 ; X86-NEXT:    vmovapd %zmm2, %zmm0
   4049 ; X86-NEXT:    retl
   4050 ;
   4051 ; X64-LABEL: test_mm512_mask3_fmaddsub_pd:
   4052 ; X64:       # %bb.0: # %entry
   4053 ; X64-NEXT:    kmovw %edi, %k1
   4054 ; X64-NEXT:    vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
   4055 ; X64-NEXT:    vmovapd %zmm2, %zmm0
   4056 ; X64-NEXT:    retq
   4057 entry:
   4058   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
   4059   %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4060   %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
   4061   %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   4062   %4 = bitcast i8 %__U to <8 x i1>
   4063   %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__C
   4064   ret <8 x double> %5
   4065 }
   4066 
   4067 define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   4068 ; X86-LABEL: test_mm512_maskz_fmaddsub_pd:
   4069 ; X86:       # %bb.0: # %entry
   4070 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4071 ; X86-NEXT:    kmovw %eax, %k1
   4072 ; X86-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
   4073 ; X86-NEXT:    retl
   4074 ;
   4075 ; X64-LABEL: test_mm512_maskz_fmaddsub_pd:
   4076 ; X64:       # %bb.0: # %entry
   4077 ; X64-NEXT:    kmovw %edi, %k1
   4078 ; X64-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
   4079 ; X64-NEXT:    retq
   4080 entry:
   4081   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
   4082   %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4083   %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
   4084   %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   4085   %4 = bitcast i8 %__U to <8 x i1>
   4086   %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> zeroinitializer
   4087   ret <8 x double> %5
   4088 }
   4089 
   4090 define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   4091 ; CHECK-LABEL: test_mm512_fmsubadd_pd:
   4092 ; CHECK:       # %bb.0: # %entry
   4093 ; CHECK-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
   4094 ; CHECK-NEXT:    ret{{[l|q]}}
   4095 entry:
   4096   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4097   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
   4098   %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
   4099   %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   4100   ret <8 x double> %2
   4101 }
   4102 
   4103 define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
   4104 ; X86-LABEL: test_mm512_mask_fmsubadd_pd:
   4105 ; X86:       # %bb.0: # %entry
   4106 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4107 ; X86-NEXT:    kmovw %eax, %k1
   4108 ; X86-NEXT:    vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
   4109 ; X86-NEXT:    retl
   4110 ;
   4111 ; X64-LABEL: test_mm512_mask_fmsubadd_pd:
   4112 ; X64:       # %bb.0: # %entry
   4113 ; X64-NEXT:    kmovw %edi, %k1
   4114 ; X64-NEXT:    vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
   4115 ; X64-NEXT:    retq
   4116 entry:
   4117   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4118   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
   4119   %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
   4120   %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   4121   %3 = bitcast i8 %__U to <8 x i1>
   4122   %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__A
   4123   ret <8 x double> %4
   4124 }
   4125 
   4126 define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
   4127 ; X86-LABEL: test_mm512_maskz_fmsubadd_pd:
   4128 ; X86:       # %bb.0: # %entry
   4129 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4130 ; X86-NEXT:    kmovw %eax, %k1
   4131 ; X86-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
   4132 ; X86-NEXT:    retl
   4133 ;
   4134 ; X64-LABEL: test_mm512_maskz_fmsubadd_pd:
   4135 ; X64:       # %bb.0: # %entry
   4136 ; X64-NEXT:    kmovw %edi, %k1
   4137 ; X64-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
   4138 ; X64-NEXT:    retq
   4139 entry:
   4140   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4141   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
   4142   %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
   4143   %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   4144   %3 = bitcast i8 %__U to <8 x i1>
   4145   %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
   4146   ret <8 x double> %4
   4147 }
   4148 
   4149 define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   4150 ; CHECK-LABEL: test_mm512_fmaddsub_round_ps:
   4151 ; CHECK:       # %bb.0: # %entry
   4152 ; CHECK-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
   4153 ; CHECK-NEXT:    ret{{[l|q]}}
   4154 entry:
   4155   %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
   4156   ret <16 x float> %0
   4157 }
   4158 
   4159 declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
   4160 
   4161 define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
   4162 ; X86-LABEL: test_mm512_mask_fmaddsub_round_ps:
   4163 ; X86:       # %bb.0: # %entry
   4164 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4165 ; X86-NEXT:    vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   4166 ; X86-NEXT:    retl
   4167 ;
   4168 ; X64-LABEL: test_mm512_mask_fmaddsub_round_ps:
   4169 ; X64:       # %bb.0: # %entry
   4170 ; X64-NEXT:    kmovw %edi, %k1
   4171 ; X64-NEXT:    vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   4172 ; X64-NEXT:    retq
   4173 entry:
   4174   %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
   4175   %1 = bitcast i16 %__U to <16 x i1>
   4176   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
   4177   ret <16 x float> %2
   4178 }
   4179 
   4180 define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
   4181 ; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps:
   4182 ; X86:       # %bb.0: # %entry
   4183 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4184 ; X86-NEXT:    vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4185 ; X86-NEXT:    vmovaps %zmm2, %zmm0
   4186 ; X86-NEXT:    retl
   4187 ;
   4188 ; X64-LABEL: test_mm512_mask3_fmaddsub_round_ps:
   4189 ; X64:       # %bb.0: # %entry
   4190 ; X64-NEXT:    kmovw %edi, %k1
   4191 ; X64-NEXT:    vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4192 ; X64-NEXT:    vmovaps %zmm2, %zmm0
   4193 ; X64-NEXT:    retq
   4194 entry:
   4195   %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
   4196   %1 = bitcast i16 %__U to <16 x i1>
   4197   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
   4198   ret <16 x float> %2
   4199 }
   4200 
   4201 define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   4202 ; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps:
   4203 ; X86:       # %bb.0: # %entry
   4204 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4205 ; X86-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   4206 ; X86-NEXT:    retl
   4207 ;
   4208 ; X64-LABEL: test_mm512_maskz_fmaddsub_round_ps:
   4209 ; X64:       # %bb.0: # %entry
   4210 ; X64-NEXT:    kmovw %edi, %k1
   4211 ; X64-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   4212 ; X64-NEXT:    retq
   4213 entry:
   4214   %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
   4215   %1 = bitcast i16 %__U to <16 x i1>
   4216   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   4217   ret <16 x float> %2
   4218 }
   4219 
   4220 define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   4221 ; X86-LABEL: test_mm512_fmsubadd_round_ps:
   4222 ; X86:       # %bb.0: # %entry
   4223 ; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
   4224 ; X86-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
   4225 ; X86-NEXT:    retl
   4226 ;
   4227 ; X64-LABEL: test_mm512_fmsubadd_round_ps:
   4228 ; X64:       # %bb.0: # %entry
   4229 ; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
   4230 ; X64-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
   4231 ; X64-NEXT:    retq
   4232 entry:
   4233   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4234   %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
   4235   ret <16 x float> %0
   4236 }
   4237 
   4238 define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
   4239 ; X86-LABEL: test_mm512_mask_fmsubadd_round_ps:
   4240 ; X86:       # %bb.0: # %entry
   4241 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4242 ; X86-NEXT:    vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   4243 ; X86-NEXT:    retl
   4244 ;
   4245 ; X64-LABEL: test_mm512_mask_fmsubadd_round_ps:
   4246 ; X64:       # %bb.0: # %entry
   4247 ; X64-NEXT:    kmovw %edi, %k1
   4248 ; X64-NEXT:    vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   4249 ; X64-NEXT:    retq
   4250 entry:
   4251   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4252   %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
   4253   %1 = bitcast i16 %__U to <16 x i1>
   4254   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
   4255   ret <16 x float> %2
   4256 }
   4257 
   4258 define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   4259 ; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps:
   4260 ; X86:       # %bb.0: # %entry
   4261 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4262 ; X86-NEXT:    vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   4263 ; X86-NEXT:    retl
   4264 ;
   4265 ; X64-LABEL: test_mm512_maskz_fmsubadd_round_ps:
   4266 ; X64:       # %bb.0: # %entry
   4267 ; X64-NEXT:    kmovw %edi, %k1
   4268 ; X64-NEXT:    vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
   4269 ; X64-NEXT:    retq
   4270 entry:
   4271   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4272   %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
   4273   %1 = bitcast i16 %__U to <16 x i1>
   4274   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   4275   ret <16 x float> %2
   4276 }
   4277 
   4278 define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   4279 ; CHECK-LABEL: test_mm512_fmaddsub_ps:
   4280 ; CHECK:       # %bb.0: # %entry
   4281 ; CHECK-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
   4282 ; CHECK-NEXT:    ret{{[l|q]}}
   4283 entry:
   4284   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
   4285   %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4286   %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
   4287   %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
   4288   ret <16 x float> %3
   4289 }
   4290 
   4291 define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
   4292 ; X86-LABEL: test_mm512_mask_fmaddsub_ps:
   4293 ; X86:       # %bb.0: # %entry
   4294 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4295 ; X86-NEXT:    vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
   4296 ; X86-NEXT:    retl
   4297 ;
   4298 ; X64-LABEL: test_mm512_mask_fmaddsub_ps:
   4299 ; X64:       # %bb.0: # %entry
   4300 ; X64-NEXT:    kmovw %edi, %k1
   4301 ; X64-NEXT:    vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
   4302 ; X64-NEXT:    retq
   4303 entry:
   4304   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
   4305   %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4306   %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
   4307   %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
   4308   %4 = bitcast i16 %__U to <16 x i1>
   4309   %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__A
   4310   ret <16 x float> %5
   4311 }
   4312 
   4313 define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
   4314 ; X86-LABEL: test_mm512_mask3_fmaddsub_ps:
   4315 ; X86:       # %bb.0: # %entry
   4316 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4317 ; X86-NEXT:    vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
   4318 ; X86-NEXT:    vmovaps %zmm2, %zmm0
   4319 ; X86-NEXT:    retl
   4320 ;
   4321 ; X64-LABEL: test_mm512_mask3_fmaddsub_ps:
   4322 ; X64:       # %bb.0: # %entry
   4323 ; X64-NEXT:    kmovw %edi, %k1
   4324 ; X64-NEXT:    vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
   4325 ; X64-NEXT:    vmovaps %zmm2, %zmm0
   4326 ; X64-NEXT:    retq
   4327 entry:
   4328   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
   4329   %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4330   %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
   4331   %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
   4332   %4 = bitcast i16 %__U to <16 x i1>
   4333   %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__C
   4334   ret <16 x float> %5
   4335 }
   4336 
   4337 define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   4338 ; X86-LABEL: test_mm512_maskz_fmaddsub_ps:
   4339 ; X86:       # %bb.0: # %entry
   4340 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4341 ; X86-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
   4342 ; X86-NEXT:    retl
   4343 ;
   4344 ; X64-LABEL: test_mm512_maskz_fmaddsub_ps:
   4345 ; X64:       # %bb.0: # %entry
   4346 ; X64-NEXT:    kmovw %edi, %k1
   4347 ; X64-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
   4348 ; X64-NEXT:    retq
   4349 entry:
   4350   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
   4351   %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4352   %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
   4353   %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
   4354   %4 = bitcast i16 %__U to <16 x i1>
   4355   %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> zeroinitializer
   4356   ret <16 x float> %5
   4357 }
   4358 
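; fmsubadd is the mirror pattern: the shufflevector operands are swapped, so the
; plain (adding) FMA supplies the even lanes and the negated-%__C FMA supplies
; the odd lanes.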
   4359 define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   4360 ; CHECK-LABEL: test_mm512_fmsubadd_ps:
   4361 ; CHECK:       # %bb.0: # %entry
   4362 ; CHECK-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
   4363 ; CHECK-NEXT:    ret{{[l|q]}}
   4364 entry:
   4365   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4366   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
   4367   %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
   4368   %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
   4369   ret <16 x float> %2
   4370 }
   4371 
   4372 define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
   4373 ; X86-LABEL: test_mm512_mask_fmsubadd_ps:
   4374 ; X86:       # %bb.0: # %entry
   4375 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4376 ; X86-NEXT:    vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
   4377 ; X86-NEXT:    retl
   4378 ;
   4379 ; X64-LABEL: test_mm512_mask_fmsubadd_ps:
   4380 ; X64:       # %bb.0: # %entry
   4381 ; X64-NEXT:    kmovw %edi, %k1
   4382 ; X64-NEXT:    vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
   4383 ; X64-NEXT:    retq
   4384 entry:
   4385   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4386   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
   4387   %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
   4388   %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
   4389   %3 = bitcast i16 %__U to <16 x i1>
   4390   %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__A
   4391   ret <16 x float> %4
   4392 }
   4393 
   4394 define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
   4395 ; X86-LABEL: test_mm512_maskz_fmsubadd_ps:
   4396 ; X86:       # %bb.0: # %entry
   4397 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4398 ; X86-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
   4399 ; X86-NEXT:    retl
   4400 ;
   4401 ; X64-LABEL: test_mm512_maskz_fmsubadd_ps:
   4402 ; X64:       # %bb.0: # %entry
   4403 ; X64-NEXT:    kmovw %edi, %k1
   4404 ; X64-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
   4405 ; X64-NEXT:    retq
   4406 entry:
   4407   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4408   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
   4409   %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
   4410   %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
   4411   %3 = bitcast i16 %__U to <16 x i1>
   4412   %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
   4413   ret <16 x float> %4
   4414 }
   4415 
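; fmsub is an FMA with the addend negated. The *_round_* variants below call
; @llvm.x86.avx512.vfmadd.{pd,ps}.512 with a rounding operand of 8, which the
; checks print as {rn-sae}; the plain variants go through @llvm.fma.* instead.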
   4416 define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
   4417 ; X86-LABEL: test_mm512_mask3_fmsub_round_pd:
   4418 ; X86:       # %bb.0: # %entry
   4419 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4420 ; X86-NEXT:    kmovw %eax, %k1
   4421 ; X86-NEXT:    vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4422 ; X86-NEXT:    vmovapd %zmm2, %zmm0
   4423 ; X86-NEXT:    retl
   4424 ;
   4425 ; X64-LABEL: test_mm512_mask3_fmsub_round_pd:
   4426 ; X64:       # %bb.0: # %entry
   4427 ; X64-NEXT:    kmovw %edi, %k1
   4428 ; X64-NEXT:    vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4429 ; X64-NEXT:    vmovapd %zmm2, %zmm0
   4430 ; X64-NEXT:    retq
   4431 entry:
   4432   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4433   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
   4434   %1 = bitcast i8 %__U to <8 x i1>
   4435   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
   4436   ret <8 x double> %2
   4437 }
   4438 
   4439 define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
   4440 ; X86-LABEL: test_mm512_mask3_fmsub_pd:
   4441 ; X86:       # %bb.0: # %entry
   4442 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4443 ; X86-NEXT:    kmovw %eax, %k1
   4444 ; X86-NEXT:    vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
   4445 ; X86-NEXT:    vmovapd %zmm2, %zmm0
   4446 ; X86-NEXT:    retl
   4447 ;
   4448 ; X64-LABEL: test_mm512_mask3_fmsub_pd:
   4449 ; X64:       # %bb.0: # %entry
   4450 ; X64-NEXT:    kmovw %edi, %k1
   4451 ; X64-NEXT:    vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
   4452 ; X64-NEXT:    vmovapd %zmm2, %zmm0
   4453 ; X64-NEXT:    retq
   4454 entry:
   4455   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4456   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
   4457   %1 = bitcast i8 %__U to <8 x i1>
   4458   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
   4459   ret <8 x double> %2
   4460 }
   4461 
   4462 define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
   4463 ; X86-LABEL: test_mm512_mask3_fmsub_round_ps:
   4464 ; X86:       # %bb.0: # %entry
   4465 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4466 ; X86-NEXT:    vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4467 ; X86-NEXT:    vmovaps %zmm2, %zmm0
   4468 ; X86-NEXT:    retl
   4469 ;
   4470 ; X64-LABEL: test_mm512_mask3_fmsub_round_ps:
   4471 ; X64:       # %bb.0: # %entry
   4472 ; X64-NEXT:    kmovw %edi, %k1
   4473 ; X64-NEXT:    vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4474 ; X64-NEXT:    vmovaps %zmm2, %zmm0
   4475 ; X64-NEXT:    retq
   4476 entry:
   4477   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4478   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
   4479   %1 = bitcast i16 %__U to <16 x i1>
   4480   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
   4481   ret <16 x float> %2
   4482 }
   4483 
   4484 define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
   4485 ; X86-LABEL: test_mm512_mask3_fmsub_ps:
   4486 ; X86:       # %bb.0: # %entry
   4487 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4488 ; X86-NEXT:    vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
   4489 ; X86-NEXT:    vmovaps %zmm2, %zmm0
   4490 ; X86-NEXT:    retl
   4491 ;
   4492 ; X64-LABEL: test_mm512_mask3_fmsub_ps:
   4493 ; X64:       # %bb.0: # %entry
   4494 ; X64-NEXT:    kmovw %edi, %k1
   4495 ; X64-NEXT:    vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
   4496 ; X64-NEXT:    vmovaps %zmm2, %zmm0
   4497 ; X64-NEXT:    retq
   4498 entry:
   4499   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4500   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
   4501   %1 = bitcast i16 %__U to <16 x i1>
   4502   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
   4503   ret <16 x float> %2
   4504 }
   4505 
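; With rounding, mask3 fmsubadd is expressed through
; @llvm.x86.avx512.vfmaddsub.{pd,ps}.512 on the negated %__C, whereas the plain
; mask3 forms below emit the two-FMA-plus-shufflevector pattern directly.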
   4506 define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
   4507 ; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd:
   4508 ; X86:       # %bb.0: # %entry
   4509 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4510 ; X86-NEXT:    kmovw %eax, %k1
   4511 ; X86-NEXT:    vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4512 ; X86-NEXT:    vmovapd %zmm2, %zmm0
   4513 ; X86-NEXT:    retl
   4514 ;
   4515 ; X64-LABEL: test_mm512_mask3_fmsubadd_round_pd:
   4516 ; X64:       # %bb.0: # %entry
   4517 ; X64-NEXT:    kmovw %edi, %k1
   4518 ; X64-NEXT:    vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4519 ; X64-NEXT:    vmovapd %zmm2, %zmm0
   4520 ; X64-NEXT:    retq
   4521 entry:
   4522   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4523   %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
   4524   %1 = bitcast i8 %__U to <8 x i1>
   4525   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
   4526   ret <8 x double> %2
   4527 }
   4528 
   4529 define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
   4530 ; X86-LABEL: test_mm512_mask3_fmsubadd_pd:
   4531 ; X86:       # %bb.0: # %entry
   4532 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4533 ; X86-NEXT:    kmovw %eax, %k1
   4534 ; X86-NEXT:    vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
   4535 ; X86-NEXT:    vmovapd %zmm2, %zmm0
   4536 ; X86-NEXT:    retl
   4537 ;
   4538 ; X64-LABEL: test_mm512_mask3_fmsubadd_pd:
   4539 ; X64:       # %bb.0: # %entry
   4540 ; X64-NEXT:    kmovw %edi, %k1
   4541 ; X64-NEXT:    vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
   4542 ; X64-NEXT:    vmovapd %zmm2, %zmm0
   4543 ; X64-NEXT:    retq
   4544 entry:
   4545   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4546   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
   4547   %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
   4548   %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   4549   %3 = bitcast i8 %__U to <8 x i1>
   4550   %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__C
   4551   ret <8 x double> %4
   4552 }
   4553 
   4554 define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
   4555 ; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps:
   4556 ; X86:       # %bb.0: # %entry
   4557 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4558 ; X86-NEXT:    vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4559 ; X86-NEXT:    vmovaps %zmm2, %zmm0
   4560 ; X86-NEXT:    retl
   4561 ;
   4562 ; X64-LABEL: test_mm512_mask3_fmsubadd_round_ps:
   4563 ; X64:       # %bb.0: # %entry
   4564 ; X64-NEXT:    kmovw %edi, %k1
   4565 ; X64-NEXT:    vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4566 ; X64-NEXT:    vmovaps %zmm2, %zmm0
   4567 ; X64-NEXT:    retq
   4568 entry:
   4569   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4570   %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
   4571   %1 = bitcast i16 %__U to <16 x i1>
   4572   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
   4573   ret <16 x float> %2
   4574 }
   4575 
   4576 define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
   4577 ; X86-LABEL: test_mm512_mask3_fmsubadd_ps:
   4578 ; X86:       # %bb.0: # %entry
   4579 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4580 ; X86-NEXT:    vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
   4581 ; X86-NEXT:    vmovaps %zmm2, %zmm0
   4582 ; X86-NEXT:    retl
   4583 ;
   4584 ; X64-LABEL: test_mm512_mask3_fmsubadd_ps:
   4585 ; X64:       # %bb.0: # %entry
   4586 ; X64-NEXT:    kmovw %edi, %k1
   4587 ; X64-NEXT:    vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
   4588 ; X64-NEXT:    vmovaps %zmm2, %zmm0
   4589 ; X64-NEXT:    retq
   4590 entry:
   4591   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4592   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
   4593   %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
   4594   %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
   4595   %3 = bitcast i16 %__U to <16 x i1>
   4596   %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__C
   4597   ret <16 x float> %4
   4598 }
   4599 
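; fnmadd negates one multiplicand: %__A is run through an fsub from -0.0 before
; the FMA, and the masked result is merged back over %__A as usual.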
   4600 define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
   4601 ; X86-LABEL: test_mm512_mask_fnmadd_round_pd:
   4602 ; X86:       # %bb.0: # %entry
   4603 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4604 ; X86-NEXT:    kmovw %eax, %k1
   4605 ; X86-NEXT:    vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   4606 ; X86-NEXT:    retl
   4607 ;
   4608 ; X64-LABEL: test_mm512_mask_fnmadd_round_pd:
   4609 ; X64:       # %bb.0: # %entry
   4610 ; X64-NEXT:    kmovw %edi, %k1
   4611 ; X64-NEXT:    vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   4612 ; X64-NEXT:    retq
   4613 entry:
   4614   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   4615   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
   4616   %1 = bitcast i8 %__U to <8 x i1>
   4617   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
   4618   ret <8 x double> %2
   4619 }
   4620 
   4621 define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
   4622 ; X86-LABEL: test_mm512_mask_fnmadd_pd:
   4623 ; X86:       # %bb.0: # %entry
   4624 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4625 ; X86-NEXT:    kmovw %eax, %k1
   4626 ; X86-NEXT:    vfnmadd132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
   4627 ; X86-NEXT:    retl
   4628 ;
   4629 ; X64-LABEL: test_mm512_mask_fnmadd_pd:
   4630 ; X64:       # %bb.0: # %entry
   4631 ; X64-NEXT:    kmovw %edi, %k1
   4632 ; X64-NEXT:    vfnmadd132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
   4633 ; X64-NEXT:    retq
   4634 entry:
   4635   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   4636   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
   4637   %1 = bitcast i8 %__U to <8 x i1>
   4638   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
   4639   ret <8 x double> %2
   4640 }
   4641 
   4642 define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
   4643 ; X86-LABEL: test_mm512_mask_fnmadd_round_ps:
   4644 ; X86:       # %bb.0: # %entry
   4645 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4646 ; X86-NEXT:    vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   4647 ; X86-NEXT:    retl
   4648 ;
   4649 ; X64-LABEL: test_mm512_mask_fnmadd_round_ps:
   4650 ; X64:       # %bb.0: # %entry
   4651 ; X64-NEXT:    kmovw %edi, %k1
   4652 ; X64-NEXT:    vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   4653 ; X64-NEXT:    retq
   4654 entry:
   4655   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   4656   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
   4657   %1 = bitcast i16 %__U to <16 x i1>
   4658   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
   4659   ret <16 x float> %2
   4660 }
   4661 
   4662 define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
   4663 ; X86-LABEL: test_mm512_mask_fnmadd_ps:
   4664 ; X86:       # %bb.0: # %entry
   4665 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4666 ; X86-NEXT:    vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
   4667 ; X86-NEXT:    retl
   4668 ;
   4669 ; X64-LABEL: test_mm512_mask_fnmadd_ps:
   4670 ; X64:       # %bb.0: # %entry
   4671 ; X64-NEXT:    kmovw %edi, %k1
   4672 ; X64-NEXT:    vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
   4673 ; X64-NEXT:    retq
   4674 entry:
   4675   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   4676   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
   4677   %1 = bitcast i16 %__U to <16 x i1>
   4678   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
   4679   ret <16 x float> %2
   4680 }
   4681 
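; fnmsub negates both the second multiplicand (%__B) and the addend (%__C); the
; mask and mask3 forms differ only in which operand the select falls back to
; (%__A vs. %__C).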
   4682 define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
   4683 ; X86-LABEL: test_mm512_mask_fnmsub_round_pd:
   4684 ; X86:       # %bb.0: # %entry
   4685 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4686 ; X86-NEXT:    kmovw %eax, %k1
   4687 ; X86-NEXT:    vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   4688 ; X86-NEXT:    retl
   4689 ;
   4690 ; X64-LABEL: test_mm512_mask_fnmsub_round_pd:
   4691 ; X64:       # %bb.0: # %entry
   4692 ; X64-NEXT:    kmovw %edi, %k1
   4693 ; X64-NEXT:    vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   4694 ; X64-NEXT:    retq
   4695 entry:
   4696   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
   4697   %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4698   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
   4699   %1 = bitcast i8 %__U to <8 x i1>
   4700   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
   4701   ret <8 x double> %2
   4702 }
   4703 
   4704 define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
   4705 ; X86-LABEL: test_mm512_mask3_fnmsub_round_pd:
   4706 ; X86:       # %bb.0: # %entry
   4707 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4708 ; X86-NEXT:    kmovw %eax, %k1
   4709 ; X86-NEXT:    vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4710 ; X86-NEXT:    vmovapd %zmm2, %zmm0
   4711 ; X86-NEXT:    retl
   4712 ;
   4713 ; X64-LABEL: test_mm512_mask3_fnmsub_round_pd:
   4714 ; X64:       # %bb.0: # %entry
   4715 ; X64-NEXT:    kmovw %edi, %k1
   4716 ; X64-NEXT:    vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4717 ; X64-NEXT:    vmovapd %zmm2, %zmm0
   4718 ; X64-NEXT:    retq
   4719 entry:
   4720   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
   4721   %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4722   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
   4723   %1 = bitcast i8 %__U to <8 x i1>
   4724   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
   4725   ret <8 x double> %2
   4726 }
   4727 
   4728 define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
   4729 ; X86-LABEL: test_mm512_mask_fnmsub_pd:
   4730 ; X86:       # %bb.0: # %entry
   4731 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4732 ; X86-NEXT:    kmovw %eax, %k1
   4733 ; X86-NEXT:    vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
   4734 ; X86-NEXT:    retl
   4735 ;
   4736 ; X64-LABEL: test_mm512_mask_fnmsub_pd:
   4737 ; X64:       # %bb.0: # %entry
   4738 ; X64-NEXT:    kmovw %edi, %k1
   4739 ; X64-NEXT:    vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
   4740 ; X64-NEXT:    retq
   4741 entry:
   4742   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
   4743   %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4744   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
   4745   %1 = bitcast i8 %__U to <8 x i1>
   4746   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
   4747   ret <8 x double> %2
   4748 }
   4749 
   4750 define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
   4751 ; X86-LABEL: test_mm512_mask3_fnmsub_pd:
   4752 ; X86:       # %bb.0: # %entry
   4753 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4754 ; X86-NEXT:    kmovw %eax, %k1
   4755 ; X86-NEXT:    vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
   4756 ; X86-NEXT:    vmovapd %zmm2, %zmm0
   4757 ; X86-NEXT:    retl
   4758 ;
   4759 ; X64-LABEL: test_mm512_mask3_fnmsub_pd:
   4760 ; X64:       # %bb.0: # %entry
   4761 ; X64-NEXT:    kmovw %edi, %k1
   4762 ; X64-NEXT:    vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
   4763 ; X64-NEXT:    vmovapd %zmm2, %zmm0
   4764 ; X64-NEXT:    retq
   4765 entry:
   4766   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
   4767   %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
   4768   %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
   4769   %1 = bitcast i8 %__U to <8 x i1>
   4770   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
   4771   ret <8 x double> %2
   4772 }
   4773 
   4774 define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
   4775 ; X86-LABEL: test_mm512_mask_fnmsub_round_ps:
   4776 ; X86:       # %bb.0: # %entry
   4777 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4778 ; X86-NEXT:    vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   4779 ; X86-NEXT:    retl
   4780 ;
   4781 ; X64-LABEL: test_mm512_mask_fnmsub_round_ps:
   4782 ; X64:       # %bb.0: # %entry
   4783 ; X64-NEXT:    kmovw %edi, %k1
   4784 ; X64-NEXT:    vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
   4785 ; X64-NEXT:    retq
   4786 entry:
   4787   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
   4788   %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4789   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
   4790   %1 = bitcast i16 %__U to <16 x i1>
   4791   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
   4792   ret <16 x float> %2
   4793 }
   4794 
   4795 define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
   4796 ; X86-LABEL: test_mm512_mask3_fnmsub_round_ps:
   4797 ; X86:       # %bb.0: # %entry
   4798 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4799 ; X86-NEXT:    vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4800 ; X86-NEXT:    vmovaps %zmm2, %zmm0
   4801 ; X86-NEXT:    retl
   4802 ;
   4803 ; X64-LABEL: test_mm512_mask3_fnmsub_round_ps:
   4804 ; X64:       # %bb.0: # %entry
   4805 ; X64-NEXT:    kmovw %edi, %k1
   4806 ; X64-NEXT:    vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
   4807 ; X64-NEXT:    vmovaps %zmm2, %zmm0
   4808 ; X64-NEXT:    retq
   4809 entry:
   4810   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
   4811   %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4812   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
   4813   %1 = bitcast i16 %__U to <16 x i1>
   4814   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
   4815   ret <16 x float> %2
   4816 }
   4817 
   4818 define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
   4819 ; X86-LABEL: test_mm512_mask_fnmsub_ps:
   4820 ; X86:       # %bb.0: # %entry
   4821 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4822 ; X86-NEXT:    vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
   4823 ; X86-NEXT:    retl
   4824 ;
   4825 ; X64-LABEL: test_mm512_mask_fnmsub_ps:
   4826 ; X64:       # %bb.0: # %entry
   4827 ; X64-NEXT:    kmovw %edi, %k1
   4828 ; X64-NEXT:    vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
   4829 ; X64-NEXT:    retq
   4830 entry:
   4831   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
   4832   %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4833   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
   4834   %1 = bitcast i16 %__U to <16 x i1>
   4835   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
   4836   ret <16 x float> %2
   4837 }
   4838 
   4839 define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
   4840 ; X86-LABEL: test_mm512_mask3_fnmsub_ps:
   4841 ; X86:       # %bb.0: # %entry
   4842 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   4843 ; X86-NEXT:    vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
   4844 ; X86-NEXT:    vmovaps %zmm2, %zmm0
   4845 ; X86-NEXT:    retl
   4846 ;
   4847 ; X64-LABEL: test_mm512_mask3_fnmsub_ps:
   4848 ; X64:       # %bb.0: # %entry
   4849 ; X64-NEXT:    kmovw %edi, %k1
   4850 ; X64-NEXT:    vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
   4851 ; X64-NEXT:    vmovaps %zmm2, %zmm0
   4852 ; X64-NEXT:    retq
   4853 entry:
   4854   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
   4855   %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
   4856   %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
   4857   %1 = bitcast i16 %__U to <16 x i1>
   4858   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
   4859   ret <16 x float> %2
   4860 }
   4861 
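; The remaining tests cover the scalar (lane-0) masked FMA forms. The plain
; variants test the mask with 'and i8 %__U, 1' plus an icmp/select and reinsert
; the chosen scalar into lane 0; the *_round_* variants instead bitcast %__U to
; <8 x i1>, extract bit 0, and call @llvm.x86.avx512.vfmadd.f32 with rounding
; operand 8. A small C sketch of source this presumably mirrors (the helper
; name is illustrative, not from this file):
;   #include <immintrin.h>
;   __m128 fmadd_ss_demo(__m128 w, __mmask8 u, __m128 a, __m128 b) {
;     return _mm_mask_fmadd_ss(w, u, a, b); // lane-0 FMA, merged with w under bit 0 of u
;   }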
   4862 define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   4863 ; X86-LABEL: test_mm_mask_fmadd_ss:
   4864 ; X86:       # %bb.0: # %entry
   4865 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4866 ; X86-NEXT:    kmovw %eax, %k1
   4867 ; X86-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
   4868 ; X86-NEXT:    retl
   4869 ;
   4870 ; X64-LABEL: test_mm_mask_fmadd_ss:
   4871 ; X64:       # %bb.0: # %entry
   4872 ; X64-NEXT:    kmovw %edi, %k1
   4873 ; X64-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
   4874 ; X64-NEXT:    retq
   4875 entry:
   4876   %0 = extractelement <4 x float> %__W, i64 0
   4877   %1 = extractelement <4 x float> %__A, i64 0
   4878   %2 = extractelement <4 x float> %__B, i64 0
   4879   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
   4880   %4 = and i8 %__U, 1
   4881   %tobool.i = icmp eq i8 %4, 0
   4882   %vecext1.i = extractelement <4 x float> %__W, i32 0
   4883   %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
   4884   %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
   4885   ret <4 x float> %vecins.i
   4886 }
   4887 
   4888 define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   4889 ; X86-LABEL: test_mm_mask_fmadd_round_ss:
   4890 ; X86:       # %bb.0: # %entry
   4891 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4892 ; X86-NEXT:    kmovw %eax, %k1
   4893 ; X86-NEXT:    vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   4894 ; X86-NEXT:    retl
   4895 ;
   4896 ; X64-LABEL: test_mm_mask_fmadd_round_ss:
   4897 ; X64:       # %bb.0: # %entry
   4898 ; X64-NEXT:    kmovw %edi, %k1
   4899 ; X64-NEXT:    vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   4900 ; X64-NEXT:    retq
   4901 entry:
   4902   %0 = extractelement <4 x float> %__W, i64 0
   4903   %1 = extractelement <4 x float> %__A, i64 0
   4904   %2 = extractelement <4 x float> %__B, i64 0
   4905   %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
   4906   %4 = bitcast i8 %__U to <8 x i1>
   4907   %5 = extractelement <8 x i1> %4, i64 0
   4908   %6 = select i1 %5, float %3, float %0
   4909   %7 = insertelement <4 x float> %__W, float %6, i64 0
   4910   ret <4 x float> %7
   4911 }
   4912 
   4913 declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #1
   4914 
   4915 define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
   4916 ; X86-LABEL: test_mm_maskz_fmadd_ss:
   4917 ; X86:       # %bb.0: # %entry
   4918 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4919 ; X86-NEXT:    kmovw %eax, %k1
   4920 ; X86-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
   4921 ; X86-NEXT:    retl
   4922 ;
   4923 ; X64-LABEL: test_mm_maskz_fmadd_ss:
   4924 ; X64:       # %bb.0: # %entry
   4925 ; X64-NEXT:    kmovw %edi, %k1
   4926 ; X64-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
   4927 ; X64-NEXT:    retq
   4928 entry:
   4929   %0 = extractelement <4 x float> %__A, i64 0
   4930   %1 = extractelement <4 x float> %__B, i64 0
   4931   %2 = extractelement <4 x float> %__C, i64 0
   4932   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
   4933   %4 = and i8 %__U, 1
   4934   %tobool.i = icmp eq i8 %4, 0
   4935   %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
   4936   %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
   4937   ret <4 x float> %vecins.i
   4938 }
   4939 
   4940 define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
   4941 ; X86-LABEL: test_mm_maskz_fmadd_round_ss:
   4942 ; X86:       # %bb.0: # %entry
   4943 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4944 ; X86-NEXT:    kmovw %eax, %k1
   4945 ; X86-NEXT:    vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   4946 ; X86-NEXT:    retl
   4947 ;
   4948 ; X64-LABEL: test_mm_maskz_fmadd_round_ss:
   4949 ; X64:       # %bb.0: # %entry
   4950 ; X64-NEXT:    kmovw %edi, %k1
   4951 ; X64-NEXT:    vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   4952 ; X64-NEXT:    retq
   4953 entry:
   4954   %0 = extractelement <4 x float> %__A, i64 0
   4955   %1 = extractelement <4 x float> %__B, i64 0
   4956   %2 = extractelement <4 x float> %__C, i64 0
   4957   %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
   4958   %4 = bitcast i8 %__U to <8 x i1>
   4959   %5 = extractelement <8 x i1> %4, i64 0
   4960   %6 = select i1 %5, float %3, float 0.000000e+00
   4961   %7 = insertelement <4 x float> %__A, float %6, i64 0
   4962   ret <4 x float> %7
   4963 }
   4964 
   4965 define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
   4966 ; X86-LABEL: test_mm_mask3_fmadd_ss:
   4967 ; X86:       # %bb.0: # %entry
   4968 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4969 ; X86-NEXT:    kmovw %eax, %k1
   4970 ; X86-NEXT:    vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
   4971 ; X86-NEXT:    vmovaps %xmm2, %xmm0
   4972 ; X86-NEXT:    retl
   4973 ;
   4974 ; X64-LABEL: test_mm_mask3_fmadd_ss:
   4975 ; X64:       # %bb.0: # %entry
   4976 ; X64-NEXT:    kmovw %edi, %k1
   4977 ; X64-NEXT:    vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
   4978 ; X64-NEXT:    vmovaps %xmm2, %xmm0
   4979 ; X64-NEXT:    retq
   4980 entry:
   4981   %0 = extractelement <4 x float> %__W, i64 0
   4982   %1 = extractelement <4 x float> %__X, i64 0
   4983   %2 = extractelement <4 x float> %__Y, i64 0
   4984   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
   4985   %4 = and i8 %__U, 1
   4986   %tobool.i = icmp eq i8 %4, 0
   4987   %vecext1.i = extractelement <4 x float> %__Y, i32 0
   4988   %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
   4989   %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
   4990   ret <4 x float> %vecins.i
   4991 }
   4992 
   4993 define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
   4994 ; X86-LABEL: test_mm_mask3_fmadd_round_ss:
   4995 ; X86:       # %bb.0: # %entry
   4996 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   4997 ; X86-NEXT:    kmovw %eax, %k1
   4998 ; X86-NEXT:    vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   4999 ; X86-NEXT:    vmovaps %xmm2, %xmm0
   5000 ; X86-NEXT:    retl
   5001 ;
   5002 ; X64-LABEL: test_mm_mask3_fmadd_round_ss:
   5003 ; X64:       # %bb.0: # %entry
   5004 ; X64-NEXT:    kmovw %edi, %k1
   5005 ; X64-NEXT:    vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   5006 ; X64-NEXT:    vmovaps %xmm2, %xmm0
   5007 ; X64-NEXT:    retq
   5008 entry:
   5009   %0 = extractelement <4 x float> %__W, i64 0
   5010   %1 = extractelement <4 x float> %__X, i64 0
   5011   %2 = extractelement <4 x float> %__Y, i64 0
   5012   %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
   5013   %4 = bitcast i8 %__U to <8 x i1>
   5014   %5 = extractelement <8 x i1> %4, i64 0
   5015   %6 = select i1 %5, float %3, float %2
   5016   %7 = insertelement <4 x float> %__Y, float %6, i64 0
   5017   ret <4 x float> %7
   5018 }
   5019 
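; The scalar fmsub forms negate only the extracted addend (lane 0 of %__B, %__C,
; or %__Y depending on the variant) rather than the whole vector.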
   5020 define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   5021 ; X86-LABEL: test_mm_mask_fmsub_ss:
   5022 ; X86:       # %bb.0: # %entry
   5023 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5024 ; X86-NEXT:    kmovw %eax, %k1
   5025 ; X86-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
   5026 ; X86-NEXT:    retl
   5027 ;
   5028 ; X64-LABEL: test_mm_mask_fmsub_ss:
   5029 ; X64:       # %bb.0: # %entry
   5030 ; X64-NEXT:    kmovw %edi, %k1
   5031 ; X64-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
   5032 ; X64-NEXT:    retq
   5033 entry:
   5034   %0 = extractelement <4 x float> %__W, i64 0
   5035   %1 = extractelement <4 x float> %__A, i64 0
   5036   %.rhs.i = extractelement <4 x float> %__B, i64 0
   5037   %2 = fsub float -0.000000e+00, %.rhs.i
   5038   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
   5039   %4 = and i8 %__U, 1
   5040   %tobool.i = icmp eq i8 %4, 0
   5041   %vecext1.i = extractelement <4 x float> %__W, i32 0
   5042   %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
   5043   %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
   5044   ret <4 x float> %vecins.i
   5045 }
   5046 
   5047 define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   5048 ; X86-LABEL: test_mm_mask_fmsub_round_ss:
   5049 ; X86:       # %bb.0: # %entry
   5050 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5051 ; X86-NEXT:    kmovw %eax, %k1
   5052 ; X86-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   5053 ; X86-NEXT:    retl
   5054 ;
   5055 ; X64-LABEL: test_mm_mask_fmsub_round_ss:
   5056 ; X64:       # %bb.0: # %entry
   5057 ; X64-NEXT:    kmovw %edi, %k1
   5058 ; X64-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   5059 ; X64-NEXT:    retq
   5060 entry:
   5061   %0 = extractelement <4 x float> %__W, i64 0
   5062   %1 = extractelement <4 x float> %__A, i64 0
   5063   %.rhs = extractelement <4 x float> %__B, i64 0
   5064   %2 = fsub float -0.000000e+00, %.rhs
   5065   %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
   5066   %4 = bitcast i8 %__U to <8 x i1>
   5067   %5 = extractelement <8 x i1> %4, i64 0
   5068   %6 = select i1 %5, float %3, float %0
   5069   %7 = insertelement <4 x float> %__W, float %6, i64 0
   5070   ret <4 x float> %7
   5071 }
   5072 
   5073 define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
   5074 ; X86-LABEL: test_mm_maskz_fmsub_ss:
   5075 ; X86:       # %bb.0: # %entry
   5076 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5077 ; X86-NEXT:    kmovw %eax, %k1
   5078 ; X86-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
   5079 ; X86-NEXT:    retl
   5080 ;
   5081 ; X64-LABEL: test_mm_maskz_fmsub_ss:
   5082 ; X64:       # %bb.0: # %entry
   5083 ; X64-NEXT:    kmovw %edi, %k1
   5084 ; X64-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
   5085 ; X64-NEXT:    retq
   5086 entry:
   5087   %0 = extractelement <4 x float> %__A, i64 0
   5088   %1 = extractelement <4 x float> %__B, i64 0
   5089   %.rhs.i = extractelement <4 x float> %__C, i64 0
   5090   %2 = fsub float -0.000000e+00, %.rhs.i
   5091   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
   5092   %4 = and i8 %__U, 1
   5093   %tobool.i = icmp eq i8 %4, 0
   5094   %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
   5095   %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
   5096   ret <4 x float> %vecins.i
   5097 }
   5098 
   5099 define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
   5100 ; X86-LABEL: test_mm_maskz_fmsub_round_ss:
   5101 ; X86:       # %bb.0: # %entry
   5102 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5103 ; X86-NEXT:    kmovw %eax, %k1
   5104 ; X86-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   5105 ; X86-NEXT:    retl
   5106 ;
   5107 ; X64-LABEL: test_mm_maskz_fmsub_round_ss:
   5108 ; X64:       # %bb.0: # %entry
   5109 ; X64-NEXT:    kmovw %edi, %k1
   5110 ; X64-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   5111 ; X64-NEXT:    retq
   5112 entry:
   5113   %0 = extractelement <4 x float> %__A, i64 0
   5114   %1 = extractelement <4 x float> %__B, i64 0
   5115   %.rhs = extractelement <4 x float> %__C, i64 0
   5116   %2 = fsub float -0.000000e+00, %.rhs
   5117   %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
   5118   %4 = bitcast i8 %__U to <8 x i1>
   5119   %5 = extractelement <8 x i1> %4, i64 0
   5120   %6 = select i1 %5, float %3, float 0.000000e+00
   5121   %7 = insertelement <4 x float> %__A, float %6, i64 0
   5122   ret <4 x float> %7
   5123 }
   5124 
   5125 define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
   5126 ; X86-LABEL: test_mm_mask3_fmsub_ss:
   5127 ; X86:       # %bb.0: # %entry
   5128 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5129 ; X86-NEXT:    kmovw %eax, %k1
   5130 ; X86-NEXT:    vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
   5131 ; X86-NEXT:    vmovaps %xmm2, %xmm0
   5132 ; X86-NEXT:    retl
   5133 ;
   5134 ; X64-LABEL: test_mm_mask3_fmsub_ss:
   5135 ; X64:       # %bb.0: # %entry
   5136 ; X64-NEXT:    kmovw %edi, %k1
   5137 ; X64-NEXT:    vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
   5138 ; X64-NEXT:    vmovaps %xmm2, %xmm0
   5139 ; X64-NEXT:    retq
   5140 entry:
   5141   %0 = extractelement <4 x float> %__W, i64 0
   5142   %1 = extractelement <4 x float> %__X, i64 0
   5143   %.rhs.i = extractelement <4 x float> %__Y, i64 0
   5144   %2 = fsub float -0.000000e+00, %.rhs.i
   5145   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
   5146   %4 = and i8 %__U, 1
   5147   %tobool.i = icmp eq i8 %4, 0
   5148   %vecext1.i = extractelement <4 x float> %__Y, i32 0
   5149   %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
   5150   %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
   5151   ret <4 x float> %vecins.i
   5152 }
   5153 
   5154 define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
   5155 ; X86-LABEL: test_mm_mask3_fmsub_round_ss:
   5156 ; X86:       # %bb.0: # %entry
   5157 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5158 ; X86-NEXT:    kmovw %eax, %k1
   5159 ; X86-NEXT:    vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   5160 ; X86-NEXT:    vmovaps %xmm2, %xmm0
   5161 ; X86-NEXT:    retl
   5162 ;
   5163 ; X64-LABEL: test_mm_mask3_fmsub_round_ss:
   5164 ; X64:       # %bb.0: # %entry
   5165 ; X64-NEXT:    kmovw %edi, %k1
   5166 ; X64-NEXT:    vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   5167 ; X64-NEXT:    vmovaps %xmm2, %xmm0
   5168 ; X64-NEXT:    retq
   5169 entry:
   5170   %0 = extractelement <4 x float> %__W, i64 0
   5171   %1 = extractelement <4 x float> %__X, i64 0
   5172   %.rhs = extractelement <4 x float> %__Y, i64 0
   5173   %2 = fsub float -0.000000e+00, %.rhs
   5174   %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
   5175   %4 = bitcast i8 %__U to <8 x i1>
   5176   %5 = extractelement <8 x i1> %4, i64 0
   5177   %6 = select i1 %5, float %3, float %.rhs
   5178   %7 = insertelement <4 x float> %__Y, float %6, i64 0
   5179   ret <4 x float> %7
   5180 }
   5181 
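; The scalar fnmadd forms negate the extracted multiplicand (lane 0 of %__A,
; %__B, or %__X) and leave the addend untouched.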
   5182 define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   5183 ; X86-LABEL: test_mm_mask_fnmadd_ss:
   5184 ; X86:       # %bb.0: # %entry
   5185 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5186 ; X86-NEXT:    kmovw %eax, %k1
   5187 ; X86-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
   5188 ; X86-NEXT:    retl
   5189 ;
   5190 ; X64-LABEL: test_mm_mask_fnmadd_ss:
   5191 ; X64:       # %bb.0: # %entry
   5192 ; X64-NEXT:    kmovw %edi, %k1
   5193 ; X64-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
   5194 ; X64-NEXT:    retq
   5195 entry:
   5196   %0 = extractelement <4 x float> %__W, i64 0
   5197   %.rhs.i = extractelement <4 x float> %__A, i64 0
   5198   %1 = fsub float -0.000000e+00, %.rhs.i
   5199   %2 = extractelement <4 x float> %__B, i64 0
   5200   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
   5201   %4 = and i8 %__U, 1
   5202   %tobool.i = icmp eq i8 %4, 0
   5203   %vecext1.i = extractelement <4 x float> %__W, i32 0
   5204   %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
   5205   %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
   5206   ret <4 x float> %vecins.i
   5207 }
   5208 
   5209 define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   5210 ; X86-LABEL: test_mm_mask_fnmadd_round_ss:
   5211 ; X86:       # %bb.0: # %entry
   5212 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5213 ; X86-NEXT:    kmovw %eax, %k1
   5214 ; X86-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   5215 ; X86-NEXT:    retl
   5216 ;
   5217 ; X64-LABEL: test_mm_mask_fnmadd_round_ss:
   5218 ; X64:       # %bb.0: # %entry
   5219 ; X64-NEXT:    kmovw %edi, %k1
   5220 ; X64-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   5221 ; X64-NEXT:    retq
   5222 entry:
   5223   %0 = extractelement <4 x float> %__W, i64 0
   5224   %.rhs = extractelement <4 x float> %__A, i64 0
   5225   %1 = fsub float -0.000000e+00, %.rhs
   5226   %2 = extractelement <4 x float> %__B, i64 0
   5227   %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
   5228   %4 = bitcast i8 %__U to <8 x i1>
   5229   %5 = extractelement <8 x i1> %4, i64 0
   5230   %6 = select i1 %5, float %3, float %0
   5231   %7 = insertelement <4 x float> %__W, float %6, i64 0
   5232   ret <4 x float> %7
   5233 }
   5234 
   5235 define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
   5236 ; X86-LABEL: test_mm_maskz_fnmadd_ss:
   5237 ; X86:       # %bb.0: # %entry
   5238 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5239 ; X86-NEXT:    kmovw %eax, %k1
   5240 ; X86-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
   5241 ; X86-NEXT:    retl
   5242 ;
   5243 ; X64-LABEL: test_mm_maskz_fnmadd_ss:
   5244 ; X64:       # %bb.0: # %entry
   5245 ; X64-NEXT:    kmovw %edi, %k1
   5246 ; X64-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
   5247 ; X64-NEXT:    retq
   5248 entry:
   5249   %0 = extractelement <4 x float> %__A, i64 0
   5250   %.rhs.i = extractelement <4 x float> %__B, i64 0
   5251   %1 = fsub float -0.000000e+00, %.rhs.i
   5252   %2 = extractelement <4 x float> %__C, i64 0
   5253   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
   5254   %4 = and i8 %__U, 1
   5255   %tobool.i = icmp eq i8 %4, 0
   5256   %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
   5257   %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
   5258   ret <4 x float> %vecins.i
   5259 }
   5260 
   5261 define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
   5262 ; X86-LABEL: test_mm_maskz_fnmadd_round_ss:
   5263 ; X86:       # %bb.0: # %entry
   5264 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5265 ; X86-NEXT:    kmovw %eax, %k1
   5266 ; X86-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   5267 ; X86-NEXT:    retl
   5268 ;
   5269 ; X64-LABEL: test_mm_maskz_fnmadd_round_ss:
   5270 ; X64:       # %bb.0: # %entry
   5271 ; X64-NEXT:    kmovw %edi, %k1
   5272 ; X64-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   5273 ; X64-NEXT:    retq
   5274 entry:
   5275   %0 = extractelement <4 x float> %__A, i64 0
   5276   %.rhs = extractelement <4 x float> %__B, i64 0
   5277   %1 = fsub float -0.000000e+00, %.rhs
   5278   %2 = extractelement <4 x float> %__C, i64 0
   5279   %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
   5280   %4 = bitcast i8 %__U to <8 x i1>
   5281   %5 = extractelement <8 x i1> %4, i64 0
   5282   %6 = select i1 %5, float %3, float 0.000000e+00
   5283   %7 = insertelement <4 x float> %__A, float %6, i64 0
   5284   ret <4 x float> %7
   5285 }
   5286 
   5287 define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
   5288 ; X86-LABEL: test_mm_mask3_fnmadd_ss:
   5289 ; X86:       # %bb.0: # %entry
   5290 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5291 ; X86-NEXT:    kmovw %eax, %k1
   5292 ; X86-NEXT:    vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
   5293 ; X86-NEXT:    vmovaps %xmm2, %xmm0
   5294 ; X86-NEXT:    retl
   5295 ;
   5296 ; X64-LABEL: test_mm_mask3_fnmadd_ss:
   5297 ; X64:       # %bb.0: # %entry
   5298 ; X64-NEXT:    kmovw %edi, %k1
   5299 ; X64-NEXT:    vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
   5300 ; X64-NEXT:    vmovaps %xmm2, %xmm0
   5301 ; X64-NEXT:    retq
   5302 entry:
   5303   %0 = extractelement <4 x float> %__W, i64 0
   5304   %.rhs.i = extractelement <4 x float> %__X, i64 0
   5305   %1 = fsub float -0.000000e+00, %.rhs.i
   5306   %2 = extractelement <4 x float> %__Y, i64 0
   5307   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
   5308   %4 = and i8 %__U, 1
   5309   %tobool.i = icmp eq i8 %4, 0
   5310   %vecext1.i = extractelement <4 x float> %__Y, i32 0
   5311   %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
   5312   %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
   5313   ret <4 x float> %vecins.i
   5314 }
   5315 
   5316 define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
   5317 ; X86-LABEL: test_mm_mask3_fnmadd_round_ss:
   5318 ; X86:       # %bb.0: # %entry
   5319 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5320 ; X86-NEXT:    kmovw %eax, %k1
   5321 ; X86-NEXT:    vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   5322 ; X86-NEXT:    vmovaps %xmm2, %xmm0
   5323 ; X86-NEXT:    retl
   5324 ;
   5325 ; X64-LABEL: test_mm_mask3_fnmadd_round_ss:
   5326 ; X64:       # %bb.0: # %entry
   5327 ; X64-NEXT:    kmovw %edi, %k1
   5328 ; X64-NEXT:    vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   5329 ; X64-NEXT:    vmovaps %xmm2, %xmm0
   5330 ; X64-NEXT:    retq
   5331 entry:
   5332   %0 = extractelement <4 x float> %__W, i64 0
   5333   %.rhs = extractelement <4 x float> %__X, i64 0
   5334   %1 = fsub float -0.000000e+00, %.rhs
   5335   %2 = extractelement <4 x float> %__Y, i64 0
   5336   %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
   5337   %4 = bitcast i8 %__U to <8 x i1>
   5338   %5 = extractelement <8 x i1> %4, i64 0
   5339   %6 = select i1 %5, float %3, float %2
   5340   %7 = insertelement <4 x float> %__Y, float %6, i64 0
   5341   ret <4 x float> %7
   5342 }
   5343 
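; The scalar fnmsub forms negate both extracted operands: the multiplicand and
; the addend are each run through an fsub from -0.0 before the scalar FMA.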
   5344 define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   5345 ; X86-LABEL: test_mm_mask_fnmsub_ss:
   5346 ; X86:       # %bb.0: # %entry
   5347 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5348 ; X86-NEXT:    kmovw %eax, %k1
   5349 ; X86-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
   5350 ; X86-NEXT:    retl
   5351 ;
   5352 ; X64-LABEL: test_mm_mask_fnmsub_ss:
   5353 ; X64:       # %bb.0: # %entry
   5354 ; X64-NEXT:    kmovw %edi, %k1
   5355 ; X64-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
   5356 ; X64-NEXT:    retq
   5357 entry:
   5358   %0 = extractelement <4 x float> %__W, i64 0
   5359   %.rhs.i = extractelement <4 x float> %__A, i64 0
   5360   %1 = fsub float -0.000000e+00, %.rhs.i
   5361   %.rhs7.i = extractelement <4 x float> %__B, i64 0
   5362   %2 = fsub float -0.000000e+00, %.rhs7.i
   5363   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
   5364   %4 = and i8 %__U, 1
   5365   %tobool.i = icmp eq i8 %4, 0
   5366   %vecext2.i = extractelement <4 x float> %__W, i32 0
   5367   %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
   5368   %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
   5369   ret <4 x float> %vecins.i
   5370 }
   5371 
   5372 define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
   5373 ; X86-LABEL: test_mm_mask_fnmsub_round_ss:
   5374 ; X86:       # %bb.0: # %entry
   5375 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5376 ; X86-NEXT:    kmovw %eax, %k1
   5377 ; X86-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   5378 ; X86-NEXT:    retl
   5379 ;
   5380 ; X64-LABEL: test_mm_mask_fnmsub_round_ss:
   5381 ; X64:       # %bb.0: # %entry
   5382 ; X64-NEXT:    kmovw %edi, %k1
   5383 ; X64-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   5384 ; X64-NEXT:    retq
   5385 entry:
   5386   %0 = extractelement <4 x float> %__W, i64 0
   5387   %.rhs = extractelement <4 x float> %__A, i64 0
   5388   %1 = fsub float -0.000000e+00, %.rhs
   5389   %.rhs2 = extractelement <4 x float> %__B, i64 0
   5390   %2 = fsub float -0.000000e+00, %.rhs2
   5391   %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
   5392   %4 = bitcast i8 %__U to <8 x i1>
   5393   %5 = extractelement <8 x i1> %4, i64 0
   5394   %6 = select i1 %5, float %3, float %0
   5395   %7 = insertelement <4 x float> %__W, float %6, i64 0
   5396   ret <4 x float> %7
   5397 }
   5398 
   5399 define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
   5400 ; X86-LABEL: test_mm_maskz_fnmsub_ss:
   5401 ; X86:       # %bb.0: # %entry
   5402 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5403 ; X86-NEXT:    kmovw %eax, %k1
   5404 ; X86-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
   5405 ; X86-NEXT:    retl
   5406 ;
   5407 ; X64-LABEL: test_mm_maskz_fnmsub_ss:
   5408 ; X64:       # %bb.0: # %entry
   5409 ; X64-NEXT:    kmovw %edi, %k1
   5410 ; X64-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
   5411 ; X64-NEXT:    retq
   5412 entry:
   5413   %0 = extractelement <4 x float> %__A, i64 0
   5414   %.rhs.i = extractelement <4 x float> %__B, i64 0
   5415   %1 = fsub float -0.000000e+00, %.rhs.i
   5416   %.rhs5.i = extractelement <4 x float> %__C, i64 0
   5417   %2 = fsub float -0.000000e+00, %.rhs5.i
   5418   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
   5419   %4 = and i8 %__U, 1
   5420   %tobool.i = icmp eq i8 %4, 0
   5421   %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
   5422   %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
   5423   ret <4 x float> %vecins.i
   5424 }
   5425 
   5426 define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
   5427 ; X86-LABEL: test_mm_maskz_fnmsub_round_ss:
   5428 ; X86:       # %bb.0: # %entry
   5429 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5430 ; X86-NEXT:    kmovw %eax, %k1
   5431 ; X86-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   5432 ; X86-NEXT:    retl
   5433 ;
   5434 ; X64-LABEL: test_mm_maskz_fnmsub_round_ss:
   5435 ; X64:       # %bb.0: # %entry
   5436 ; X64-NEXT:    kmovw %edi, %k1
   5437 ; X64-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   5438 ; X64-NEXT:    retq
   5439 entry:
   5440   %0 = extractelement <4 x float> %__A, i64 0
   5441   %.rhs = extractelement <4 x float> %__B, i64 0
   5442   %1 = fsub float -0.000000e+00, %.rhs
   5443   %.rhs2 = extractelement <4 x float> %__C, i64 0
   5444   %2 = fsub float -0.000000e+00, %.rhs2
   5445   %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
   5446   %4 = bitcast i8 %__U to <8 x i1>
   5447   %5 = extractelement <8 x i1> %4, i64 0
   5448   %6 = select i1 %5, float %3, float 0.000000e+00
   5449   %7 = insertelement <4 x float> %__A, float %6, i64 0
   5450   ret <4 x float> %7
   5451 }
   5452 
   5453 define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
   5454 ; X86-LABEL: test_mm_mask3_fnmsub_ss:
   5455 ; X86:       # %bb.0: # %entry
   5456 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5457 ; X86-NEXT:    kmovw %eax, %k1
   5458 ; X86-NEXT:    vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
   5459 ; X86-NEXT:    vmovaps %xmm2, %xmm0
   5460 ; X86-NEXT:    retl
   5461 ;
   5462 ; X64-LABEL: test_mm_mask3_fnmsub_ss:
   5463 ; X64:       # %bb.0: # %entry
   5464 ; X64-NEXT:    kmovw %edi, %k1
   5465 ; X64-NEXT:    vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
   5466 ; X64-NEXT:    vmovaps %xmm2, %xmm0
   5467 ; X64-NEXT:    retq
   5468 entry:
   5469   %0 = extractelement <4 x float> %__W, i64 0
   5470   %.rhs.i = extractelement <4 x float> %__X, i64 0
   5471   %1 = fsub float -0.000000e+00, %.rhs.i
   5472   %.rhs7.i = extractelement <4 x float> %__Y, i64 0
   5473   %2 = fsub float -0.000000e+00, %.rhs7.i
   5474   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
   5475   %4 = and i8 %__U, 1
   5476   %tobool.i = icmp eq i8 %4, 0
   5477   %vecext2.i = extractelement <4 x float> %__Y, i32 0
   5478   %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
   5479   %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
   5480   ret <4 x float> %vecins.i
   5481 }
   5482 
   5483 define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
   5484 ; X86-LABEL: test_mm_mask3_fnmsub_round_ss:
   5485 ; X86:       # %bb.0: # %entry
   5486 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5487 ; X86-NEXT:    kmovw %eax, %k1
   5488 ; X86-NEXT:    vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   5489 ; X86-NEXT:    vmovaps %xmm2, %xmm0
   5490 ; X86-NEXT:    retl
   5491 ;
   5492 ; X64-LABEL: test_mm_mask3_fnmsub_round_ss:
   5493 ; X64:       # %bb.0: # %entry
   5494 ; X64-NEXT:    kmovw %edi, %k1
   5495 ; X64-NEXT:    vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   5496 ; X64-NEXT:    vmovaps %xmm2, %xmm0
   5497 ; X64-NEXT:    retq
   5498 entry:
   5499   %0 = extractelement <4 x float> %__W, i64 0
   5500   %.rhs = extractelement <4 x float> %__X, i64 0
   5501   %1 = fsub float -0.000000e+00, %.rhs
   5502   %.rhs1 = extractelement <4 x float> %__Y, i64 0
   5503   %2 = fsub float -0.000000e+00, %.rhs1
   5504   %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
   5505   %4 = bitcast i8 %__U to <8 x i1>
   5506   %5 = extractelement <8 x i1> %4, i64 0
   5507   %6 = select i1 %5, float %3, float %.rhs1
   5508   %7 = insertelement <4 x float> %__Y, float %6, i64 0
   5509   ret <4 x float> %7
   5510 }
   5511 
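; The remaining FMA tests repeat the same patterns for the double-precision (sd) scalar
; variants, using llvm.fma.f64 and llvm.x86.avx512.vfmadd.f64. The i32 8 rounding argument
; corresponds to the {rn-sae} operand in the checks (round-to-nearest with exceptions
; suppressed). As a rough C-level sketch of what one such test corresponds to in the clang
; builtins test (names assumed, not part of the generated checks):
;   __m128d sketch_mask_fmadd_sd(__m128d w, __mmask8 u, __m128d a, __m128d b) {
;     return _mm_mask_fmadd_sd(w, u, a, b);
;   }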
   5512 define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   5513 ; X86-LABEL: test_mm_mask_fmadd_sd:
   5514 ; X86:       # %bb.0: # %entry
   5515 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5516 ; X86-NEXT:    kmovw %eax, %k1
   5517 ; X86-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
   5518 ; X86-NEXT:    retl
   5519 ;
   5520 ; X64-LABEL: test_mm_mask_fmadd_sd:
   5521 ; X64:       # %bb.0: # %entry
   5522 ; X64-NEXT:    kmovw %edi, %k1
   5523 ; X64-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
   5524 ; X64-NEXT:    retq
   5525 entry:
   5526   %0 = extractelement <2 x double> %__W, i64 0
   5527   %1 = extractelement <2 x double> %__A, i64 0
   5528   %2 = extractelement <2 x double> %__B, i64 0
   5529   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
   5530   %4 = and i8 %__U, 1
   5531   %tobool.i = icmp eq i8 %4, 0
   5532   %vecext1.i = extractelement <2 x double> %__W, i32 0
   5533   %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
   5534   %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
   5535   ret <2 x double> %vecins.i
   5536 }
   5537 
   5538 define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   5539 ; X86-LABEL: test_mm_mask_fmadd_round_sd:
   5540 ; X86:       # %bb.0: # %entry
   5541 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5542 ; X86-NEXT:    kmovw %eax, %k1
   5543 ; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   5544 ; X86-NEXT:    retl
   5545 ;
   5546 ; X64-LABEL: test_mm_mask_fmadd_round_sd:
   5547 ; X64:       # %bb.0: # %entry
   5548 ; X64-NEXT:    kmovw %edi, %k1
   5549 ; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   5550 ; X64-NEXT:    retq
   5551 entry:
   5552   %0 = extractelement <2 x double> %__W, i64 0
   5553   %1 = extractelement <2 x double> %__A, i64 0
   5554   %2 = extractelement <2 x double> %__B, i64 0
   5555   %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
   5556   %4 = bitcast i8 %__U to <8 x i1>
   5557   %5 = extractelement <8 x i1> %4, i64 0
   5558   %6 = select i1 %5, double %3, double %0
   5559   %7 = insertelement <2 x double> %__W, double %6, i64 0
   5560   ret <2 x double> %7
   5561 }
   5562 
   5563 declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #1
   5564 
   5565 define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
   5566 ; X86-LABEL: test_mm_maskz_fmadd_sd:
   5567 ; X86:       # %bb.0: # %entry
   5568 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5569 ; X86-NEXT:    kmovw %eax, %k1
   5570 ; X86-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
   5571 ; X86-NEXT:    retl
   5572 ;
   5573 ; X64-LABEL: test_mm_maskz_fmadd_sd:
   5574 ; X64:       # %bb.0: # %entry
   5575 ; X64-NEXT:    kmovw %edi, %k1
   5576 ; X64-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
   5577 ; X64-NEXT:    retq
   5578 entry:
   5579   %0 = extractelement <2 x double> %__A, i64 0
   5580   %1 = extractelement <2 x double> %__B, i64 0
   5581   %2 = extractelement <2 x double> %__C, i64 0
   5582   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
   5583   %4 = and i8 %__U, 1
   5584   %tobool.i = icmp eq i8 %4, 0
   5585   %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
   5586   %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
   5587   ret <2 x double> %vecins.i
   5588 }
   5589 
   5590 define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
   5591 ; X86-LABEL: test_mm_maskz_fmadd_round_sd:
   5592 ; X86:       # %bb.0: # %entry
   5593 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5594 ; X86-NEXT:    kmovw %eax, %k1
   5595 ; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   5596 ; X86-NEXT:    retl
   5597 ;
   5598 ; X64-LABEL: test_mm_maskz_fmadd_round_sd:
   5599 ; X64:       # %bb.0: # %entry
   5600 ; X64-NEXT:    kmovw %edi, %k1
   5601 ; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   5602 ; X64-NEXT:    retq
   5603 entry:
   5604   %0 = extractelement <2 x double> %__A, i64 0
   5605   %1 = extractelement <2 x double> %__B, i64 0
   5606   %2 = extractelement <2 x double> %__C, i64 0
   5607   %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
   5608   %4 = bitcast i8 %__U to <8 x i1>
   5609   %5 = extractelement <8 x i1> %4, i64 0
   5610   %6 = select i1 %5, double %3, double 0.000000e+00
   5611   %7 = insertelement <2 x double> %__A, double %6, i64 0
   5612   ret <2 x double> %7
   5613 }
   5614 
   5615 define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
   5616 ; X86-LABEL: test_mm_mask3_fmadd_sd:
   5617 ; X86:       # %bb.0: # %entry
   5618 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5619 ; X86-NEXT:    kmovw %eax, %k1
   5620 ; X86-NEXT:    vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
   5621 ; X86-NEXT:    vmovapd %xmm2, %xmm0
   5622 ; X86-NEXT:    retl
   5623 ;
   5624 ; X64-LABEL: test_mm_mask3_fmadd_sd:
   5625 ; X64:       # %bb.0: # %entry
   5626 ; X64-NEXT:    kmovw %edi, %k1
   5627 ; X64-NEXT:    vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
   5628 ; X64-NEXT:    vmovapd %xmm2, %xmm0
   5629 ; X64-NEXT:    retq
   5630 entry:
   5631   %0 = extractelement <2 x double> %__W, i64 0
   5632   %1 = extractelement <2 x double> %__X, i64 0
   5633   %2 = extractelement <2 x double> %__Y, i64 0
   5634   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
   5635   %4 = and i8 %__U, 1
   5636   %tobool.i = icmp eq i8 %4, 0
   5637   %vecext1.i = extractelement <2 x double> %__Y, i32 0
   5638   %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
   5639   %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
   5640   ret <2 x double> %vecins.i
   5641 }
   5642 
   5643 define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
   5644 ; X86-LABEL: test_mm_mask3_fmadd_round_sd:
   5645 ; X86:       # %bb.0: # %entry
   5646 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5647 ; X86-NEXT:    kmovw %eax, %k1
   5648 ; X86-NEXT:    vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   5649 ; X86-NEXT:    vmovapd %xmm2, %xmm0
   5650 ; X86-NEXT:    retl
   5651 ;
   5652 ; X64-LABEL: test_mm_mask3_fmadd_round_sd:
   5653 ; X64:       # %bb.0: # %entry
   5654 ; X64-NEXT:    kmovw %edi, %k1
   5655 ; X64-NEXT:    vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   5656 ; X64-NEXT:    vmovapd %xmm2, %xmm0
   5657 ; X64-NEXT:    retq
   5658 entry:
   5659   %0 = extractelement <2 x double> %__W, i64 0
   5660   %1 = extractelement <2 x double> %__X, i64 0
   5661   %2 = extractelement <2 x double> %__Y, i64 0
   5662   %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
   5663   %4 = bitcast i8 %__U to <8 x i1>
   5664   %5 = extractelement <8 x i1> %4, i64 0
   5665   %6 = select i1 %5, double %3, double %2
   5666   %7 = insertelement <2 x double> %__Y, double %6, i64 0
   5667   ret <2 x double> %7
   5668 }
   5669 
   5670 define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   5671 ; X86-LABEL: test_mm_mask_fmsub_sd:
   5672 ; X86:       # %bb.0: # %entry
   5673 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5674 ; X86-NEXT:    kmovw %eax, %k1
   5675 ; X86-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
   5676 ; X86-NEXT:    retl
   5677 ;
   5678 ; X64-LABEL: test_mm_mask_fmsub_sd:
   5679 ; X64:       # %bb.0: # %entry
   5680 ; X64-NEXT:    kmovw %edi, %k1
   5681 ; X64-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
   5682 ; X64-NEXT:    retq
   5683 entry:
   5684   %0 = extractelement <2 x double> %__W, i64 0
   5685   %1 = extractelement <2 x double> %__A, i64 0
   5686   %.rhs.i = extractelement <2 x double> %__B, i64 0
   5687   %2 = fsub double -0.000000e+00, %.rhs.i
   5688   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
   5689   %4 = and i8 %__U, 1
   5690   %tobool.i = icmp eq i8 %4, 0
   5691   %vecext1.i = extractelement <2 x double> %__W, i32 0
   5692   %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
   5693   %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
   5694   ret <2 x double> %vecins.i
   5695 }
   5696 
   5697 define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   5698 ; X86-LABEL: test_mm_mask_fmsub_round_sd:
   5699 ; X86:       # %bb.0: # %entry
   5700 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5701 ; X86-NEXT:    vxorpd {{\.LCPI.*}}, %xmm2, %xmm2
   5702 ; X86-NEXT:    kmovw %eax, %k1
   5703 ; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   5704 ; X86-NEXT:    retl
   5705 ;
   5706 ; X64-LABEL: test_mm_mask_fmsub_round_sd:
   5707 ; X64:       # %bb.0: # %entry
   5708 ; X64-NEXT:    vxorpd {{.*}}(%rip), %xmm2, %xmm2
   5709 ; X64-NEXT:    kmovw %edi, %k1
   5710 ; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   5711 ; X64-NEXT:    retq
   5712 entry:
   5713   %0 = extractelement <2 x double> %__W, i64 0
   5714   %1 = extractelement <2 x double> %__A, i64 0
   5715   %.rhs = extractelement <2 x double> %__B, i64 0
   5716   %2 = fsub double -0.000000e+00, %.rhs
   5717   %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
   5718   %4 = bitcast i8 %__U to <8 x i1>
   5719   %5 = extractelement <8 x i1> %4, i64 0
   5720   %6 = select i1 %5, double %3, double %0
   5721   %7 = insertelement <2 x double> %__W, double %6, i64 0
   5722   ret <2 x double> %7
   5723 }
   5724 
   5725 define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
   5726 ; X86-LABEL: test_mm_maskz_fmsub_sd:
   5727 ; X86:       # %bb.0: # %entry
   5728 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5729 ; X86-NEXT:    kmovw %eax, %k1
   5730 ; X86-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
   5731 ; X86-NEXT:    retl
   5732 ;
   5733 ; X64-LABEL: test_mm_maskz_fmsub_sd:
   5734 ; X64:       # %bb.0: # %entry
   5735 ; X64-NEXT:    kmovw %edi, %k1
   5736 ; X64-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
   5737 ; X64-NEXT:    retq
   5738 entry:
   5739   %0 = extractelement <2 x double> %__A, i64 0
   5740   %1 = extractelement <2 x double> %__B, i64 0
   5741   %.rhs.i = extractelement <2 x double> %__C, i64 0
   5742   %2 = fsub double -0.000000e+00, %.rhs.i
   5743   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
   5744   %4 = and i8 %__U, 1
   5745   %tobool.i = icmp eq i8 %4, 0
   5746   %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
   5747   %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
   5748   ret <2 x double> %vecins.i
   5749 }
   5750 
   5751 define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
   5752 ; X86-LABEL: test_mm_maskz_fmsub_round_sd:
   5753 ; X86:       # %bb.0: # %entry
   5754 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5755 ; X86-NEXT:    vxorpd {{\.LCPI.*}}, %xmm2, %xmm2
   5756 ; X86-NEXT:    kmovw %eax, %k1
   5757 ; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   5758 ; X86-NEXT:    retl
   5759 ;
   5760 ; X64-LABEL: test_mm_maskz_fmsub_round_sd:
   5761 ; X64:       # %bb.0: # %entry
   5762 ; X64-NEXT:    vxorpd {{.*}}(%rip), %xmm2, %xmm2
   5763 ; X64-NEXT:    kmovw %edi, %k1
   5764 ; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   5765 ; X64-NEXT:    retq
   5766 entry:
   5767   %0 = extractelement <2 x double> %__A, i64 0
   5768   %1 = extractelement <2 x double> %__B, i64 0
   5769   %.rhs = extractelement <2 x double> %__C, i64 0
   5770   %2 = fsub double -0.000000e+00, %.rhs
   5771   %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
   5772   %4 = bitcast i8 %__U to <8 x i1>
   5773   %5 = extractelement <8 x i1> %4, i64 0
   5774   %6 = select i1 %5, double %3, double 0.000000e+00
   5775   %7 = insertelement <2 x double> %__A, double %6, i64 0
   5776   ret <2 x double> %7
   5777 }
   5778 
   5779 define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
   5780 ; X86-LABEL: test_mm_mask3_fmsub_sd:
   5781 ; X86:       # %bb.0: # %entry
   5782 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5783 ; X86-NEXT:    kmovw %eax, %k1
   5784 ; X86-NEXT:    vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
   5785 ; X86-NEXT:    vmovapd %xmm2, %xmm0
   5786 ; X86-NEXT:    retl
   5787 ;
   5788 ; X64-LABEL: test_mm_mask3_fmsub_sd:
   5789 ; X64:       # %bb.0: # %entry
   5790 ; X64-NEXT:    kmovw %edi, %k1
   5791 ; X64-NEXT:    vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
   5792 ; X64-NEXT:    vmovapd %xmm2, %xmm0
   5793 ; X64-NEXT:    retq
   5794 entry:
   5795   %0 = extractelement <2 x double> %__W, i64 0
   5796   %1 = extractelement <2 x double> %__X, i64 0
   5797   %.rhs.i = extractelement <2 x double> %__Y, i64 0
   5798   %2 = fsub double -0.000000e+00, %.rhs.i
   5799   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
   5800   %4 = and i8 %__U, 1
   5801   %tobool.i = icmp eq i8 %4, 0
   5802   %vecext1.i = extractelement <2 x double> %__Y, i32 0
   5803   %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
   5804   %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
   5805   ret <2 x double> %vecins.i
   5806 }
   5807 
   5808 define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
   5809 ; X86-LABEL: test_mm_mask3_fmsub_round_sd:
   5810 ; X86:       # %bb.0: # %entry
   5811 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5812 ; X86-NEXT:    vxorpd {{\.LCPI.*}}, %xmm2, %xmm3
   5813 ; X86-NEXT:    vfmadd213sd %xmm3, %xmm0, %xmm1
   5814 ; X86-NEXT:    kmovw %eax, %k1
   5815 ; X86-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
   5816 ; X86-NEXT:    vmovapd %xmm2, %xmm0
   5817 ; X86-NEXT:    retl
   5818 ;
   5819 ; X64-LABEL: test_mm_mask3_fmsub_round_sd:
   5820 ; X64:       # %bb.0: # %entry
   5821 ; X64-NEXT:    vxorpd {{.*}}(%rip), %xmm2, %xmm3
   5822 ; X64-NEXT:    vfmadd213sd %xmm3, %xmm0, %xmm1
   5823 ; X64-NEXT:    kmovw %edi, %k1
   5824 ; X64-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
   5825 ; X64-NEXT:    vmovapd %xmm2, %xmm0
   5826 ; X64-NEXT:    retq
   5827 entry:
   5828   %0 = extractelement <2 x double> %__W, i64 0
   5829   %1 = extractelement <2 x double> %__X, i64 0
   5830   %.rhs = extractelement <2 x double> %__Y, i64 0
   5831   %2 = fsub double -0.000000e+00, %.rhs
   5832   %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
   5833   %4 = bitcast i8 %__U to <8 x i1>
   5834   %5 = extractelement <8 x i1> %4, i64 0
   5835   %6 = select i1 %5, double %3, double %.rhs
   5836   %7 = insertelement <2 x double> %__Y, double %6, i64 0
   5837   ret <2 x double> %7
   5838 }
   5839 
   5840 define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   5841 ; X86-LABEL: test_mm_mask_fnmadd_sd:
   5842 ; X86:       # %bb.0: # %entry
   5843 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5844 ; X86-NEXT:    kmovw %eax, %k1
   5845 ; X86-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
   5846 ; X86-NEXT:    retl
   5847 ;
   5848 ; X64-LABEL: test_mm_mask_fnmadd_sd:
   5849 ; X64:       # %bb.0: # %entry
   5850 ; X64-NEXT:    kmovw %edi, %k1
   5851 ; X64-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
   5852 ; X64-NEXT:    retq
   5853 entry:
   5854   %0 = extractelement <2 x double> %__W, i64 0
   5855   %.rhs.i = extractelement <2 x double> %__A, i64 0
   5856   %1 = fsub double -0.000000e+00, %.rhs.i
   5857   %2 = extractelement <2 x double> %__B, i64 0
   5858   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
   5859   %4 = and i8 %__U, 1
   5860   %tobool.i = icmp eq i8 %4, 0
   5861   %vecext1.i = extractelement <2 x double> %__W, i32 0
   5862   %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
   5863   %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
   5864   ret <2 x double> %vecins.i
   5865 }
   5866 
   5867 define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   5868 ; X86-LABEL: test_mm_mask_fnmadd_round_sd:
   5869 ; X86:       # %bb.0: # %entry
   5870 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5871 ; X86-NEXT:    vxorpd {{\.LCPI.*}}, %xmm1, %xmm1
   5872 ; X86-NEXT:    kmovw %eax, %k1
   5873 ; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   5874 ; X86-NEXT:    retl
   5875 ;
   5876 ; X64-LABEL: test_mm_mask_fnmadd_round_sd:
   5877 ; X64:       # %bb.0: # %entry
   5878 ; X64-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
   5879 ; X64-NEXT:    kmovw %edi, %k1
   5880 ; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   5881 ; X64-NEXT:    retq
   5882 entry:
   5883   %0 = extractelement <2 x double> %__W, i64 0
   5884   %.rhs = extractelement <2 x double> %__A, i64 0
   5885   %1 = fsub double -0.000000e+00, %.rhs
   5886   %2 = extractelement <2 x double> %__B, i64 0
   5887   %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
   5888   %4 = bitcast i8 %__U to <8 x i1>
   5889   %5 = extractelement <8 x i1> %4, i64 0
   5890   %6 = select i1 %5, double %3, double %0
   5891   %7 = insertelement <2 x double> %__W, double %6, i64 0
   5892   ret <2 x double> %7
   5893 }
   5894 
   5895 define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
   5896 ; X86-LABEL: test_mm_maskz_fnmadd_sd:
   5897 ; X86:       # %bb.0: # %entry
   5898 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5899 ; X86-NEXT:    kmovw %eax, %k1
   5900 ; X86-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
   5901 ; X86-NEXT:    retl
   5902 ;
   5903 ; X64-LABEL: test_mm_maskz_fnmadd_sd:
   5904 ; X64:       # %bb.0: # %entry
   5905 ; X64-NEXT:    kmovw %edi, %k1
   5906 ; X64-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
   5907 ; X64-NEXT:    retq
   5908 entry:
   5909   %0 = extractelement <2 x double> %__A, i64 0
   5910   %.rhs.i = extractelement <2 x double> %__B, i64 0
   5911   %1 = fsub double -0.000000e+00, %.rhs.i
   5912   %2 = extractelement <2 x double> %__C, i64 0
   5913   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
   5914   %4 = and i8 %__U, 1
   5915   %tobool.i = icmp eq i8 %4, 0
   5916   %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
   5917   %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
   5918   ret <2 x double> %vecins.i
   5919 }
   5920 
   5921 define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
   5922 ; X86-LABEL: test_mm_maskz_fnmadd_round_sd:
   5923 ; X86:       # %bb.0: # %entry
   5924 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5925 ; X86-NEXT:    vxorpd {{\.LCPI.*}}, %xmm1, %xmm1
   5926 ; X86-NEXT:    kmovw %eax, %k1
   5927 ; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   5928 ; X86-NEXT:    retl
   5929 ;
   5930 ; X64-LABEL: test_mm_maskz_fnmadd_round_sd:
   5931 ; X64:       # %bb.0: # %entry
   5932 ; X64-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
   5933 ; X64-NEXT:    kmovw %edi, %k1
   5934 ; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   5935 ; X64-NEXT:    retq
   5936 entry:
   5937   %0 = extractelement <2 x double> %__A, i64 0
   5938   %.rhs = extractelement <2 x double> %__B, i64 0
   5939   %1 = fsub double -0.000000e+00, %.rhs
   5940   %2 = extractelement <2 x double> %__C, i64 0
   5941   %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
   5942   %4 = bitcast i8 %__U to <8 x i1>
   5943   %5 = extractelement <8 x i1> %4, i64 0
   5944   %6 = select i1 %5, double %3, double 0.000000e+00
   5945   %7 = insertelement <2 x double> %__A, double %6, i64 0
   5946   ret <2 x double> %7
   5947 }
   5948 
   5949 define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
   5950 ; X86-LABEL: test_mm_mask3_fnmadd_sd:
   5951 ; X86:       # %bb.0: # %entry
   5952 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5953 ; X86-NEXT:    kmovw %eax, %k1
   5954 ; X86-NEXT:    vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
   5955 ; X86-NEXT:    vmovapd %xmm2, %xmm0
   5956 ; X86-NEXT:    retl
   5957 ;
   5958 ; X64-LABEL: test_mm_mask3_fnmadd_sd:
   5959 ; X64:       # %bb.0: # %entry
   5960 ; X64-NEXT:    kmovw %edi, %k1
   5961 ; X64-NEXT:    vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
   5962 ; X64-NEXT:    vmovapd %xmm2, %xmm0
   5963 ; X64-NEXT:    retq
   5964 entry:
   5965   %0 = extractelement <2 x double> %__W, i64 0
   5966   %.rhs.i = extractelement <2 x double> %__X, i64 0
   5967   %1 = fsub double -0.000000e+00, %.rhs.i
   5968   %2 = extractelement <2 x double> %__Y, i64 0
   5969   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
   5970   %4 = and i8 %__U, 1
   5971   %tobool.i = icmp eq i8 %4, 0
   5972   %vecext1.i = extractelement <2 x double> %__Y, i32 0
   5973   %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
   5974   %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
   5975   ret <2 x double> %vecins.i
   5976 }
   5977 
   5978 define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
   5979 ; X86-LABEL: test_mm_mask3_fnmadd_round_sd:
   5980 ; X86:       # %bb.0: # %entry
   5981 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   5982 ; X86-NEXT:    vxorpd {{\.LCPI.*}}, %xmm1, %xmm1
   5983 ; X86-NEXT:    kmovw %eax, %k1
   5984 ; X86-NEXT:    vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   5985 ; X86-NEXT:    vmovapd %xmm2, %xmm0
   5986 ; X86-NEXT:    retl
   5987 ;
   5988 ; X64-LABEL: test_mm_mask3_fnmadd_round_sd:
   5989 ; X64:       # %bb.0: # %entry
   5990 ; X64-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
   5991 ; X64-NEXT:    kmovw %edi, %k1
   5992 ; X64-NEXT:    vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   5993 ; X64-NEXT:    vmovapd %xmm2, %xmm0
   5994 ; X64-NEXT:    retq
   5995 entry:
   5996   %0 = extractelement <2 x double> %__W, i64 0
   5997   %.rhs = extractelement <2 x double> %__X, i64 0
   5998   %1 = fsub double -0.000000e+00, %.rhs
   5999   %2 = extractelement <2 x double> %__Y, i64 0
   6000   %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
   6001   %4 = bitcast i8 %__U to <8 x i1>
   6002   %5 = extractelement <8 x i1> %4, i64 0
   6003   %6 = select i1 %5, double %3, double %2
   6004   %7 = insertelement <2 x double> %__Y, double %6, i64 0
   6005   ret <2 x double> %7
   6006 }
   6007 
   6008 define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   6009 ; X86-LABEL: test_mm_mask_fnmsub_sd:
   6010 ; X86:       # %bb.0: # %entry
   6011 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6012 ; X86-NEXT:    kmovw %eax, %k1
   6013 ; X86-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
   6014 ; X86-NEXT:    retl
   6015 ;
   6016 ; X64-LABEL: test_mm_mask_fnmsub_sd:
   6017 ; X64:       # %bb.0: # %entry
   6018 ; X64-NEXT:    kmovw %edi, %k1
   6019 ; X64-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
   6020 ; X64-NEXT:    retq
   6021 entry:
   6022   %0 = extractelement <2 x double> %__W, i64 0
   6023   %.rhs.i = extractelement <2 x double> %__A, i64 0
   6024   %1 = fsub double -0.000000e+00, %.rhs.i
   6025   %.rhs7.i = extractelement <2 x double> %__B, i64 0
   6026   %2 = fsub double -0.000000e+00, %.rhs7.i
   6027   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
   6028   %4 = and i8 %__U, 1
   6029   %tobool.i = icmp eq i8 %4, 0
   6030   %vecext2.i = extractelement <2 x double> %__W, i32 0
   6031   %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
   6032   %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
   6033   ret <2 x double> %vecins.i
   6034 }
   6035 
   6036 define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
   6037 ; X86-LABEL: test_mm_mask_fnmsub_round_sd:
   6038 ; X86:       # %bb.0: # %entry
   6039 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6040 ; X86-NEXT:    kmovw %eax, %k1
   6041 ; X86-NEXT:    vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   6042 ; X86-NEXT:    retl
   6043 ;
   6044 ; X64-LABEL: test_mm_mask_fnmsub_round_sd:
   6045 ; X64:       # %bb.0: # %entry
   6046 ; X64-NEXT:    kmovw %edi, %k1
   6047 ; X64-NEXT:    vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
   6048 ; X64-NEXT:    retq
   6049 entry:
   6050   %0 = extractelement <2 x double> %__W, i64 0
   6051   %.rhs = extractelement <2 x double> %__A, i64 0
   6052   %1 = fsub double -0.000000e+00, %.rhs
   6053   %.rhs2 = extractelement <2 x double> %__B, i64 0
   6054   %2 = fsub double -0.000000e+00, %.rhs2
   6055   %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
   6056   %4 = bitcast i8 %__U to <8 x i1>
   6057   %5 = extractelement <8 x i1> %4, i64 0
   6058   %6 = select i1 %5, double %3, double %0
   6059   %7 = insertelement <2 x double> %__W, double %6, i64 0
   6060   ret <2 x double> %7
   6061 }
   6062 
   6063 define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
   6064 ; X86-LABEL: test_mm_maskz_fnmsub_sd:
   6065 ; X86:       # %bb.0: # %entry
   6066 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6067 ; X86-NEXT:    kmovw %eax, %k1
   6068 ; X86-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
   6069 ; X86-NEXT:    retl
   6070 ;
   6071 ; X64-LABEL: test_mm_maskz_fnmsub_sd:
   6072 ; X64:       # %bb.0: # %entry
   6073 ; X64-NEXT:    kmovw %edi, %k1
   6074 ; X64-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
   6075 ; X64-NEXT:    retq
   6076 entry:
   6077   %0 = extractelement <2 x double> %__A, i64 0
   6078   %.rhs.i = extractelement <2 x double> %__B, i64 0
   6079   %1 = fsub double -0.000000e+00, %.rhs.i
   6080   %.rhs5.i = extractelement <2 x double> %__C, i64 0
   6081   %2 = fsub double -0.000000e+00, %.rhs5.i
   6082   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
   6083   %4 = and i8 %__U, 1
   6084   %tobool.i = icmp eq i8 %4, 0
   6085   %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
   6086   %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
   6087   ret <2 x double> %vecins.i
   6088 }
   6089 
   6090 define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
   6091 ; X86-LABEL: test_mm_maskz_fnmsub_round_sd:
   6092 ; X86:       # %bb.0: # %entry
   6093 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6094 ; X86-NEXT:    kmovw %eax, %k1
   6095 ; X86-NEXT:    vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   6096 ; X86-NEXT:    retl
   6097 ;
   6098 ; X64-LABEL: test_mm_maskz_fnmsub_round_sd:
   6099 ; X64:       # %bb.0: # %entry
   6100 ; X64-NEXT:    kmovw %edi, %k1
   6101 ; X64-NEXT:    vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
   6102 ; X64-NEXT:    retq
   6103 entry:
   6104   %0 = extractelement <2 x double> %__A, i64 0
   6105   %.rhs = extractelement <2 x double> %__B, i64 0
   6106   %1 = fsub double -0.000000e+00, %.rhs
   6107   %.rhs2 = extractelement <2 x double> %__C, i64 0
   6108   %2 = fsub double -0.000000e+00, %.rhs2
   6109   %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
   6110   %4 = bitcast i8 %__U to <8 x i1>
   6111   %5 = extractelement <8 x i1> %4, i64 0
   6112   %6 = select i1 %5, double %3, double 0.000000e+00
   6113   %7 = insertelement <2 x double> %__A, double %6, i64 0
   6114   ret <2 x double> %7
   6115 }
   6116 
   6117 define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
   6118 ; X86-LABEL: test_mm_mask3_fnmsub_sd:
   6119 ; X86:       # %bb.0: # %entry
   6120 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6121 ; X86-NEXT:    kmovw %eax, %k1
   6122 ; X86-NEXT:    vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
   6123 ; X86-NEXT:    vmovapd %xmm2, %xmm0
   6124 ; X86-NEXT:    retl
   6125 ;
   6126 ; X64-LABEL: test_mm_mask3_fnmsub_sd:
   6127 ; X64:       # %bb.0: # %entry
   6128 ; X64-NEXT:    kmovw %edi, %k1
   6129 ; X64-NEXT:    vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
   6130 ; X64-NEXT:    vmovapd %xmm2, %xmm0
   6131 ; X64-NEXT:    retq
   6132 entry:
   6133   %0 = extractelement <2 x double> %__W, i64 0
   6134   %.rhs.i = extractelement <2 x double> %__X, i64 0
   6135   %1 = fsub double -0.000000e+00, %.rhs.i
   6136   %.rhs7.i = extractelement <2 x double> %__Y, i64 0
   6137   %2 = fsub double -0.000000e+00, %.rhs7.i
   6138   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
   6139   %4 = and i8 %__U, 1
   6140   %tobool.i = icmp eq i8 %4, 0
   6141   %vecext2.i = extractelement <2 x double> %__Y, i32 0
   6142   %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
   6143   %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
   6144   ret <2 x double> %vecins.i
   6145 }
   6146 
   6147 define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
   6148 ; X86-LABEL: test_mm_mask3_fnmsub_round_sd:
   6149 ; X86:       # %bb.0: # %entry
   6150 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6151 ; X86-NEXT:    kmovw %eax, %k1
   6152 ; X86-NEXT:    vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   6153 ; X86-NEXT:    vmovapd %xmm2, %xmm0
   6154 ; X86-NEXT:    retl
   6155 ;
   6156 ; X64-LABEL: test_mm_mask3_fnmsub_round_sd:
   6157 ; X64:       # %bb.0: # %entry
   6158 ; X64-NEXT:    kmovw %edi, %k1
   6159 ; X64-NEXT:    vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
   6160 ; X64-NEXT:    vmovapd %xmm2, %xmm0
   6161 ; X64-NEXT:    retq
   6162 entry:
   6163   %0 = extractelement <2 x double> %__W, i64 0
   6164   %.rhs = extractelement <2 x double> %__X, i64 0
   6165   %1 = fsub double -0.000000e+00, %.rhs
   6166   %.rhs1 = extractelement <2 x double> %__Y, i64 0
   6167   %2 = fsub double -0.000000e+00, %.rhs1
   6168   %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
   6169   %4 = bitcast i8 %__U to <8 x i1>
   6170   %5 = extractelement <8 x i1> %4, i64 0
   6171   %6 = select i1 %5, double %3, double %.rhs1
   6172   %7 = insertelement <2 x double> %__Y, double %6, i64 0
   6173   ret <2 x double> %7
   6174 }
   6175 
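; The expand-load tests check that the masked expand-load intrinsics are expressed with
; llvm.masked.expandload.* in IR and lower to a single masked
; vpexpandq/vexpandpd/vpexpandd/vexpandps from memory, with {z} zeroing for the maskz forms.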
   6176 define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
   6177 ; X86-LABEL: test_mm512_mask_expandloadu_epi64:
   6178 ; X86:       # %bb.0: # %entry
   6179 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6180 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6181 ; X86-NEXT:    kmovw %ecx, %k1
   6182 ; X86-NEXT:    vpexpandq (%eax), %zmm0 {%k1}
   6183 ; X86-NEXT:    retl
   6184 ;
   6185 ; X64-LABEL: test_mm512_mask_expandloadu_epi64:
   6186 ; X64:       # %bb.0: # %entry
   6187 ; X64-NEXT:    kmovw %edi, %k1
   6188 ; X64-NEXT:    vpexpandq (%rsi), %zmm0 {%k1}
   6189 ; X64-NEXT:    retq
   6190 entry:
   6191   %0 = bitcast i8* %__P to i64*
   6192   %1 = bitcast i8 %__U to <8 x i1>
   6193   %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> %__W)
   6194   ret <8 x i64> %2
   6195 }
   6196 
   6197 define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
   6198 ; X86-LABEL: test_mm512_maskz_expandloadu_epi64:
   6199 ; X86:       # %bb.0: # %entry
   6200 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6201 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6202 ; X86-NEXT:    kmovw %ecx, %k1
   6203 ; X86-NEXT:    vpexpandq (%eax), %zmm0 {%k1} {z}
   6204 ; X86-NEXT:    retl
   6205 ;
   6206 ; X64-LABEL: test_mm512_maskz_expandloadu_epi64:
   6207 ; X64:       # %bb.0: # %entry
   6208 ; X64-NEXT:    kmovw %edi, %k1
   6209 ; X64-NEXT:    vpexpandq (%rsi), %zmm0 {%k1} {z}
   6210 ; X64-NEXT:    retq
   6211 entry:
   6212   %0 = bitcast i8* %__P to i64*
   6213   %1 = bitcast i8 %__U to <8 x i1>
   6214   %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> zeroinitializer)
   6215   ret <8 x i64> %2
   6216 }
   6217 
   6218 define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
   6219 ; X86-LABEL: test_mm512_mask_expandloadu_pd:
   6220 ; X86:       # %bb.0: # %entry
   6221 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6222 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6223 ; X86-NEXT:    kmovw %ecx, %k1
   6224 ; X86-NEXT:    vexpandpd (%eax), %zmm0 {%k1}
   6225 ; X86-NEXT:    retl
   6226 ;
   6227 ; X64-LABEL: test_mm512_mask_expandloadu_pd:
   6228 ; X64:       # %bb.0: # %entry
   6229 ; X64-NEXT:    kmovw %edi, %k1
   6230 ; X64-NEXT:    vexpandpd (%rsi), %zmm0 {%k1}
   6231 ; X64-NEXT:    retq
   6232 entry:
   6233   %0 = bitcast i8* %__P to double*
   6234   %1 = bitcast i8 %__U to <8 x i1>
   6235   %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> %__W)
   6236   ret <8 x double> %2
   6237 }
   6238 
   6239 define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
   6240 ; X86-LABEL: test_mm512_maskz_expandloadu_pd:
   6241 ; X86:       # %bb.0: # %entry
   6242 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6243 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
   6244 ; X86-NEXT:    kmovw %ecx, %k1
   6245 ; X86-NEXT:    vexpandpd (%eax), %zmm0 {%k1} {z}
   6246 ; X86-NEXT:    retl
   6247 ;
   6248 ; X64-LABEL: test_mm512_maskz_expandloadu_pd:
   6249 ; X64:       # %bb.0: # %entry
   6250 ; X64-NEXT:    kmovw %edi, %k1
   6251 ; X64-NEXT:    vexpandpd (%rsi), %zmm0 {%k1} {z}
   6252 ; X64-NEXT:    retq
   6253 entry:
   6254   %0 = bitcast i8* %__P to double*
   6255   %1 = bitcast i8 %__U to <8 x i1>
   6256   %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> zeroinitializer)
   6257   ret <8 x double> %2
   6258 }
   6259 
   6260 define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, i8* readonly %__P) {
   6261 ; X86-LABEL: test_mm512_mask_expandloadu_epi32:
   6262 ; X86:       # %bb.0: # %entry
   6263 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6264 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   6265 ; X86-NEXT:    vpexpandd (%eax), %zmm0 {%k1}
   6266 ; X86-NEXT:    retl
   6267 ;
   6268 ; X64-LABEL: test_mm512_mask_expandloadu_epi32:
   6269 ; X64:       # %bb.0: # %entry
   6270 ; X64-NEXT:    kmovw %edi, %k1
   6271 ; X64-NEXT:    vpexpandd (%rsi), %zmm0 {%k1}
   6272 ; X64-NEXT:    retq
   6273 entry:
   6274   %0 = bitcast <8 x i64> %__W to <16 x i32>
   6275   %1 = bitcast i8* %__P to i32*
   6276   %2 = bitcast i16 %__U to <16 x i1>
   6277   %3 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %1, <16 x i1> %2, <16 x i32> %0) #11
   6278   %4 = bitcast <16 x i32> %3 to <8 x i64>
   6279   ret <8 x i64> %4
   6280 }
   6281 
   6282 define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, i8* readonly %__P) {
   6283 ; X86-LABEL: test_mm512_maskz_expandloadu_epi32:
   6284 ; X86:       # %bb.0: # %entry
   6285 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6286 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   6287 ; X86-NEXT:    vpexpandd (%eax), %zmm0 {%k1} {z}
   6288 ; X86-NEXT:    retl
   6289 ;
   6290 ; X64-LABEL: test_mm512_maskz_expandloadu_epi32:
   6291 ; X64:       # %bb.0: # %entry
   6292 ; X64-NEXT:    kmovw %edi, %k1
   6293 ; X64-NEXT:    vpexpandd (%rsi), %zmm0 {%k1} {z}
   6294 ; X64-NEXT:    retq
   6295 entry:
   6296   %0 = bitcast i8* %__P to i32*
   6297   %1 = bitcast i16 %__U to <16 x i1>
   6298   %2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %0, <16 x i1> %1, <16 x i32> zeroinitializer)
   6299   %3 = bitcast <16 x i32> %2 to <8 x i64>
   6300   ret <8 x i64> %3
   6301 }
   6302 
   6303 define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, i8* readonly %__P) {
   6304 ; X86-LABEL: test_mm512_mask_expandloadu_ps:
   6305 ; X86:       # %bb.0: # %entry
   6306 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6307 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   6308 ; X86-NEXT:    vexpandps (%eax), %zmm0 {%k1}
   6309 ; X86-NEXT:    retl
   6310 ;
   6311 ; X64-LABEL: test_mm512_mask_expandloadu_ps:
   6312 ; X64:       # %bb.0: # %entry
   6313 ; X64-NEXT:    kmovw %edi, %k1
   6314 ; X64-NEXT:    vexpandps (%rsi), %zmm0 {%k1}
   6315 ; X64-NEXT:    retq
   6316 entry:
   6317   %0 = bitcast i8* %__P to float*
   6318   %1 = bitcast i16 %__U to <16 x i1>
   6319   %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> %__W) #11
   6320   ret <16 x float> %2
   6321 }
   6322 
   6323 define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, i8* readonly %__P) {
   6324 ; X86-LABEL: test_mm512_maskz_expandloadu_ps:
   6325 ; X86:       # %bb.0: # %entry
   6326 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6327 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   6328 ; X86-NEXT:    vexpandps (%eax), %zmm0 {%k1} {z}
   6329 ; X86-NEXT:    retl
   6330 ;
   6331 ; X64-LABEL: test_mm512_maskz_expandloadu_ps:
   6332 ; X64:       # %bb.0: # %entry
   6333 ; X64-NEXT:    kmovw %edi, %k1
   6334 ; X64-NEXT:    vexpandps (%rsi), %zmm0 {%k1} {z}
   6335 ; X64-NEXT:    retq
   6336 entry:
   6337   %0 = bitcast i8* %__P to float*
   6338   %1 = bitcast i16 %__U to <16 x i1>
   6339   %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> zeroinitializer)
   6340   ret <16 x float> %2
   6341 }
   6342 
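; The compress-store tests are the mirror image: llvm.masked.compressstore.* should lower to
; a single masked vcompresspd/vpcompressq/vcompressps/vpcompressd store, followed by
; vzeroupper before returning.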
   6343 define void @test_mm512_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <8 x double> %__A) {
   6344 ; X86-LABEL: test_mm512_mask_compressstoreu_pd:
   6345 ; X86:       # %bb.0: # %entry
   6346 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6347 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   6348 ; X86-NEXT:    kmovw %eax, %k1
   6349 ; X86-NEXT:    vcompresspd %zmm0, (%ecx) {%k1}
   6350 ; X86-NEXT:    vzeroupper
   6351 ; X86-NEXT:    retl
   6352 ;
   6353 ; X64-LABEL: test_mm512_mask_compressstoreu_pd:
   6354 ; X64:       # %bb.0: # %entry
   6355 ; X64-NEXT:    kmovw %esi, %k1
   6356 ; X64-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
   6357 ; X64-NEXT:    vzeroupper
   6358 ; X64-NEXT:    retq
   6359 entry:
   6360   %0 = bitcast i8* %__P to double*
   6361   %1 = bitcast i8 %__U to <8 x i1>
   6362   tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, double* %0, <8 x i1> %1)
   6363   ret void
   6364 }
   6365 
   6366 define void @test_mm512_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <8 x i64> %__A) {
   6367 ; X86-LABEL: test_mm512_mask_compressstoreu_epi64:
   6368 ; X86:       # %bb.0: # %entry
   6369 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6370 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   6371 ; X86-NEXT:    kmovw %eax, %k1
   6372 ; X86-NEXT:    vpcompressq %zmm0, (%ecx) {%k1}
   6373 ; X86-NEXT:    vzeroupper
   6374 ; X86-NEXT:    retl
   6375 ;
   6376 ; X64-LABEL: test_mm512_mask_compressstoreu_epi64:
   6377 ; X64:       # %bb.0: # %entry
   6378 ; X64-NEXT:    kmovw %esi, %k1
   6379 ; X64-NEXT:    vpcompressq %zmm0, (%rdi) {%k1}
   6380 ; X64-NEXT:    vzeroupper
   6381 ; X64-NEXT:    retq
   6382 entry:
   6383   %0 = bitcast i8* %__P to i64*
   6384   %1 = bitcast i8 %__U to <8 x i1>
   6385   tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, i64* %0, <8 x i1> %1)
   6386   ret void
   6387 }
   6388 
   6389 define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) {
   6390 ; X86-LABEL: test_mm512_mask_compressstoreu_ps:
   6391 ; X86:       # %bb.0: # %entry
   6392 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   6393 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6394 ; X86-NEXT:    vcompressps %zmm0, (%eax) {%k1}
   6395 ; X86-NEXT:    vzeroupper
   6396 ; X86-NEXT:    retl
   6397 ;
   6398 ; X64-LABEL: test_mm512_mask_compressstoreu_ps:
   6399 ; X64:       # %bb.0: # %entry
   6400 ; X64-NEXT:    kmovw %esi, %k1
   6401 ; X64-NEXT:    vcompressps %zmm0, (%rdi) {%k1}
   6402 ; X64-NEXT:    vzeroupper
   6403 ; X64-NEXT:    retq
   6404 entry:
   6405   %0 = bitcast i8* %__P to float*
   6406   %1 = bitcast i16 %__U to <16 x i1>
   6407   tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, float* %0, <16 x i1> %1)
   6408   ret void
   6409 }
   6410 
   6411 define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) {
   6412 ; X86-LABEL: test_mm512_mask_compressstoreu_epi32:
   6413 ; X86:       # %bb.0: # %entry
   6414 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   6415 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   6416 ; X86-NEXT:    vpcompressd %zmm0, (%eax) {%k1}
   6417 ; X86-NEXT:    vzeroupper
   6418 ; X86-NEXT:    retl
   6419 ;
   6420 ; X64-LABEL: test_mm512_mask_compressstoreu_epi32:
   6421 ; X64:       # %bb.0: # %entry
   6422 ; X64-NEXT:    kmovw %esi, %k1
   6423 ; X64-NEXT:    vpcompressd %zmm0, (%rdi) {%k1}
   6424 ; X64-NEXT:    vzeroupper
   6425 ; X64-NEXT:    retq
   6426 entry:
   6427   %0 = bitcast <8 x i64> %__A to <16 x i32>
   6428   %1 = bitcast i8* %__P to i32*
   6429   %2 = bitcast i16 %__U to <16 x i1>
   6430   tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, i32* %1, <16 x i1> %2)
   6431   ret void
   6432 }
   6433 
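; The reduction tests below express _mm512_reduce_* style horizontal reductions as a shuffle
; tree in IR: the 512-bit vector is split into 256-bit halves, then 128-bit halves, then a
; final element swap, applying the reduction operator at each step. On X86 the 64-bit result
; is returned in edx:eax (vmovd + vpextrd), while X64 returns it in rax (vmovq).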
   6434 define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
   6435 ; X86-LABEL: test_mm512_reduce_add_epi64:
   6436 ; X86:       # %bb.0: # %entry
   6437 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6438 ; X86-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
   6439 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6440 ; X86-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
   6441 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6442 ; X86-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
   6443 ; X86-NEXT:    vmovd %xmm0, %eax
   6444 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   6445 ; X86-NEXT:    vzeroupper
   6446 ; X86-NEXT:    retl
   6447 ;
   6448 ; X64-LABEL: test_mm512_reduce_add_epi64:
   6449 ; X64:       # %bb.0: # %entry
   6450 ; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6451 ; X64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
   6452 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6453 ; X64-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
   6454 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6455 ; X64-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
   6456 ; X64-NEXT:    vmovq %xmm0, %rax
   6457 ; X64-NEXT:    vzeroupper
   6458 ; X64-NEXT:    retq
   6459 entry:
   6460   %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6461   %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   6462   %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
   6463   %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   6464   %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   6465   %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
   6466   %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
   6467   %add7.i = add <2 x i64> %shuffle6.i, %add4.i
   6468   %vecext.i = extractelement <2 x i64> %add7.i, i32 0
   6469   ret i64 %vecext.i
   6470 }
   6471 
   6472 define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
   6473 ; X86-LABEL: test_mm512_reduce_mul_epi64:
   6474 ; X86:       # %bb.0: # %entry
   6475 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6476 ; X86-NEXT:    vpsrlq $32, %ymm0, %ymm2
   6477 ; X86-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
   6478 ; X86-NEXT:    vpsrlq $32, %ymm1, %ymm3
   6479 ; X86-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
   6480 ; X86-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
   6481 ; X86-NEXT:    vpsllq $32, %ymm2, %ymm2
   6482 ; X86-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
   6483 ; X86-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
   6484 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6485 ; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
   6486 ; X86-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
   6487 ; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
   6488 ; X86-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
   6489 ; X86-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
   6490 ; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
   6491 ; X86-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
   6492 ; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
   6493 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6494 ; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
   6495 ; X86-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
   6496 ; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
   6497 ; X86-NEXT:    vpmuludq %xmm0, %xmm3, %xmm3
   6498 ; X86-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
   6499 ; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
   6500 ; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
   6501 ; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
   6502 ; X86-NEXT:    vmovd %xmm0, %eax
   6503 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   6504 ; X86-NEXT:    vzeroupper
   6505 ; X86-NEXT:    retl
   6506 ;
   6507 ; X64-LABEL: test_mm512_reduce_mul_epi64:
   6508 ; X64:       # %bb.0: # %entry
   6509 ; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6510 ; X64-NEXT:    vpsrlq $32, %ymm0, %ymm2
   6511 ; X64-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
   6512 ; X64-NEXT:    vpsrlq $32, %ymm1, %ymm3
   6513 ; X64-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
   6514 ; X64-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
   6515 ; X64-NEXT:    vpsllq $32, %ymm2, %ymm2
   6516 ; X64-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
   6517 ; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
   6518 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6519 ; X64-NEXT:    vpsrlq $32, %xmm0, %xmm2
   6520 ; X64-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
   6521 ; X64-NEXT:    vpsrlq $32, %xmm1, %xmm3
   6522 ; X64-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
   6523 ; X64-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
   6524 ; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
   6525 ; X64-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
   6526 ; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
   6527 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6528 ; X64-NEXT:    vpsrlq $32, %xmm0, %xmm2
   6529 ; X64-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
   6530 ; X64-NEXT:    vpsrlq $32, %xmm1, %xmm3
   6531 ; X64-NEXT:    vpmuludq %xmm0, %xmm3, %xmm3
   6532 ; X64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
   6533 ; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
   6534 ; X64-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
   6535 ; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
   6536 ; X64-NEXT:    vmovq %xmm0, %rax
   6537 ; X64-NEXT:    vzeroupper
   6538 ; X64-NEXT:    retq
   6539 entry:
   6540   %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6541   %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   6542   %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
   6543   %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   6544   %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   6545   %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
   6546   %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
   6547   %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
   6548   %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
   6549   ret i64 %vecext.i
   6550 }
   6551 
   6552 define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
   6553 ; X86-LABEL: test_mm512_reduce_or_epi64:
   6554 ; X86:       # %bb.0: # %entry
   6555 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6556 ; X86-NEXT:    vpor %ymm1, %ymm0, %ymm0
   6557 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6558 ; X86-NEXT:    vpor %xmm1, %xmm0, %xmm0
   6559 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6560 ; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
   6561 ; X86-NEXT:    vmovd %xmm0, %eax
   6562 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   6563 ; X86-NEXT:    vzeroupper
   6564 ; X86-NEXT:    retl
   6565 ;
   6566 ; X64-LABEL: test_mm512_reduce_or_epi64:
   6567 ; X64:       # %bb.0: # %entry
   6568 ; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6569 ; X64-NEXT:    vpor %ymm1, %ymm0, %ymm0
   6570 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6571 ; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
   6572 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6573 ; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
   6574 ; X64-NEXT:    vmovq %xmm0, %rax
   6575 ; X64-NEXT:    vzeroupper
   6576 ; X64-NEXT:    retq
   6577 entry:
   6578   %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6579   %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   6580   %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
   6581   %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   6582   %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   6583   %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
   6584   %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
   6585   %or7.i = or <2 x i64> %shuffle6.i, %or4.i
   6586   %vecext.i = extractelement <2 x i64> %or7.i, i32 0
   6587   ret i64 %vecext.i
   6588 }
   6589 
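; Same halving pattern as the OR reduction above, using vpand.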
   6590 define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
   6591 ; X86-LABEL: test_mm512_reduce_and_epi64:
   6592 ; X86:       # %bb.0: # %entry
   6593 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6594 ; X86-NEXT:    vpand %ymm1, %ymm0, %ymm0
   6595 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6596 ; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
   6597 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6598 ; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
   6599 ; X86-NEXT:    vmovd %xmm0, %eax
   6600 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   6601 ; X86-NEXT:    vzeroupper
   6602 ; X86-NEXT:    retl
   6603 ;
   6604 ; X64-LABEL: test_mm512_reduce_and_epi64:
   6605 ; X64:       # %bb.0: # %entry
   6606 ; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6607 ; X64-NEXT:    vpand %ymm1, %ymm0, %ymm0
   6608 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6609 ; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
   6610 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6611 ; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
   6612 ; X64-NEXT:    vmovq %xmm0, %rax
   6613 ; X64-NEXT:    vzeroupper
   6614 ; X64-NEXT:    retq
   6615 entry:
   6616   %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6617   %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   6618   %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
   6619   %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   6620   %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   6621   %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
   6622   %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
   6623   %and7.i = and <2 x i64> %shuffle6.i, %and4.i
   6624   %vecext.i = extractelement <2 x i64> %and7.i, i32 0
   6625   ret i64 %vecext.i
   6626 }
   6627 
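; Masked add reduction: masked-off lanes are zeroed (vmovdqa64 {%k1} {z}),
; zero being the identity for addition, before the usual 512->256->128 halving.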
   6628 define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
   6629 ; X86-LABEL: test_mm512_mask_reduce_add_epi64:
   6630 ; X86:       # %bb.0: # %entry
   6631 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6632 ; X86-NEXT:    kmovw %eax, %k1
   6633 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
   6634 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6635 ; X86-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
   6636 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6637 ; X86-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
   6638 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6639 ; X86-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
   6640 ; X86-NEXT:    vmovd %xmm0, %eax
   6641 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   6642 ; X86-NEXT:    vzeroupper
   6643 ; X86-NEXT:    retl
   6644 ;
   6645 ; X64-LABEL: test_mm512_mask_reduce_add_epi64:
   6646 ; X64:       # %bb.0: # %entry
   6647 ; X64-NEXT:    kmovw %edi, %k1
   6648 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
   6649 ; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6650 ; X64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
   6651 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6652 ; X64-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
   6653 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6654 ; X64-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
   6655 ; X64-NEXT:    vmovq %xmm0, %rax
   6656 ; X64-NEXT:    vzeroupper
   6657 ; X64-NEXT:    retq
   6658 entry:
   6659   %0 = bitcast i8 %__M to <8 x i1>
   6660   %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
   6661   %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6662   %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   6663   %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
   6664   %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   6665   %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   6666   %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
   6667   %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
   6668   %add7.i = add <2 x i64> %shuffle6.i, %add4.i
   6669   %vecext.i = extractelement <2 x i64> %add7.i, i32 0
   6670   ret i64 %vecext.i
   6671 }
   6672 
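; Masked multiply reduction: masked-off lanes are replaced with 1, the
; multiplicative identity. AVX512F alone has no packed 64-bit multiply
; (vpmullq needs AVX512DQ), so each mul <n x i64> step is expanded into the
; vpmuludq/vpsrlq/vpsllq/vpaddq sequence seen below. The X86 constant is the
; same splat of 1 printed as sixteen 32-bit halves [1,0,1,0,...].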
   6673 define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
   6674 ; X86-LABEL: test_mm512_mask_reduce_mul_epi64:
   6675 ; X86:       # %bb.0: # %entry
   6676 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6677 ; X86-NEXT:    kmovw %eax, %k1
   6678 ; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
   6679 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
   6680 ; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   6681 ; X86-NEXT:    vpsrlq $32, %ymm1, %ymm2
   6682 ; X86-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
   6683 ; X86-NEXT:    vpsrlq $32, %ymm0, %ymm3
   6684 ; X86-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
   6685 ; X86-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
   6686 ; X86-NEXT:    vpsllq $32, %ymm2, %ymm2
   6687 ; X86-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
   6688 ; X86-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
   6689 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6690 ; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
   6691 ; X86-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
   6692 ; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
   6693 ; X86-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
   6694 ; X86-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
   6695 ; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
   6696 ; X86-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
   6697 ; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
   6698 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6699 ; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
   6700 ; X86-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
   6701 ; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
   6702 ; X86-NEXT:    vpmuludq %xmm0, %xmm3, %xmm3
   6703 ; X86-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
   6704 ; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
   6705 ; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
   6706 ; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
   6707 ; X86-NEXT:    vmovd %xmm0, %eax
   6708 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   6709 ; X86-NEXT:    vzeroupper
   6710 ; X86-NEXT:    retl
   6711 ;
   6712 ; X64-LABEL: test_mm512_mask_reduce_mul_epi64:
   6713 ; X64:       # %bb.0: # %entry
   6714 ; X64-NEXT:    kmovw %edi, %k1
   6715 ; X64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
   6716 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
   6717 ; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   6718 ; X64-NEXT:    vpsrlq $32, %ymm1, %ymm2
   6719 ; X64-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
   6720 ; X64-NEXT:    vpsrlq $32, %ymm0, %ymm3
   6721 ; X64-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
   6722 ; X64-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
   6723 ; X64-NEXT:    vpsllq $32, %ymm2, %ymm2
   6724 ; X64-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
   6725 ; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
   6726 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6727 ; X64-NEXT:    vpsrlq $32, %xmm0, %xmm2
   6728 ; X64-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
   6729 ; X64-NEXT:    vpsrlq $32, %xmm1, %xmm3
   6730 ; X64-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
   6731 ; X64-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
   6732 ; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
   6733 ; X64-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
   6734 ; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
   6735 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6736 ; X64-NEXT:    vpsrlq $32, %xmm0, %xmm2
   6737 ; X64-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
   6738 ; X64-NEXT:    vpsrlq $32, %xmm1, %xmm3
   6739 ; X64-NEXT:    vpmuludq %xmm0, %xmm3, %xmm3
   6740 ; X64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
   6741 ; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
   6742 ; X64-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
   6743 ; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
   6744 ; X64-NEXT:    vmovq %xmm0, %rax
   6745 ; X64-NEXT:    vzeroupper
   6746 ; X64-NEXT:    retq
   6747 entry:
   6748   %0 = bitcast i8 %__M to <8 x i1>
   6749   %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
   6750   %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6751   %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   6752   %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
   6753   %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   6754   %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   6755   %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
   6756   %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
   6757   %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
   6758   %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
   6759   ret i64 %vecext.i
   6760 }
   6761 
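; Masked AND reduction: the identity is all-ones, materialized with
; vpternlogd $255 (constant-1 truth table) rather than a memory load.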
   6762 define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
   6763 ; X86-LABEL: test_mm512_mask_reduce_and_epi64:
   6764 ; X86:       # %bb.0: # %entry
   6765 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6766 ; X86-NEXT:    kmovw %eax, %k1
   6767 ; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
   6768 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
   6769 ; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   6770 ; X86-NEXT:    vpand %ymm0, %ymm1, %ymm0
   6771 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6772 ; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
   6773 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6774 ; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
   6775 ; X86-NEXT:    vmovd %xmm0, %eax
   6776 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   6777 ; X86-NEXT:    vzeroupper
   6778 ; X86-NEXT:    retl
   6779 ;
   6780 ; X64-LABEL: test_mm512_mask_reduce_and_epi64:
   6781 ; X64:       # %bb.0: # %entry
   6782 ; X64-NEXT:    kmovw %edi, %k1
   6783 ; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
   6784 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
   6785 ; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   6786 ; X64-NEXT:    vpand %ymm0, %ymm1, %ymm0
   6787 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6788 ; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
   6789 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6790 ; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
   6791 ; X64-NEXT:    vmovq %xmm0, %rax
   6792 ; X64-NEXT:    vzeroupper
   6793 ; X64-NEXT:    retq
   6794 entry:
   6795   %0 = bitcast i8 %__M to <8 x i1>
   6796   %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
   6797   %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6798   %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   6799   %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
   6800   %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   6801   %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   6802   %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
   6803   %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
   6804   %and7.i = and <2 x i64> %shuffle6.i, %and4.i
   6805   %vecext.i = extractelement <2 x i64> %and7.i, i32 0
   6806   ret i64 %vecext.i
   6807 }
   6808 
   6809 define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
   6810 ; X86-LABEL: test_mm512_mask_reduce_or_epi64:
   6811 ; X86:       # %bb.0: # %entry
   6812 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   6813 ; X86-NEXT:    kmovw %eax, %k1
   6814 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
   6815 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6816 ; X86-NEXT:    vpor %ymm1, %ymm0, %ymm0
   6817 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6818 ; X86-NEXT:    vpor %xmm1, %xmm0, %xmm0
   6819 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6820 ; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
   6821 ; X86-NEXT:    vmovd %xmm0, %eax
   6822 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   6823 ; X86-NEXT:    vzeroupper
   6824 ; X86-NEXT:    retl
   6825 ;
   6826 ; X64-LABEL: test_mm512_mask_reduce_or_epi64:
   6827 ; X64:       # %bb.0: # %entry
   6828 ; X64-NEXT:    kmovw %edi, %k1
   6829 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
   6830 ; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6831 ; X64-NEXT:    vpor %ymm1, %ymm0, %ymm0
   6832 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6833 ; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
   6834 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6835 ; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
   6836 ; X64-NEXT:    vmovq %xmm0, %rax
   6837 ; X64-NEXT:    vzeroupper
   6838 ; X64-NEXT:    retq
   6839 entry:
   6840   %0 = bitcast i8 %__M to <8 x i1>
   6841   %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
   6842   %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6843   %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   6844   %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
   6845   %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   6846   %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   6847   %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
   6848   %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
   6849   %or7.i = or <2 x i64> %shuffle6.i, %or4.i
   6850   %vecext.i = extractelement <2 x i64> %or7.i, i32 0
   6851   ret i64 %vecext.i
   6852 }
   6853 
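; 32-bit reductions: the <8 x i64> argument is reinterpreted as <16 x i32>
; via bitcasts around each shuffle. Both targets share one CHECK block, and
; the final two-lane add is folded into a single vphaddd.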
   6854 define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
   6855 ; CHECK-LABEL: test_mm512_reduce_add_epi32:
   6856 ; CHECK:       # %bb.0: # %entry
   6857 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6858 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
   6859 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6860 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
   6861 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6862 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
   6863 ; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
   6864 ; CHECK-NEXT:    vmovd %xmm0, %eax
   6865 ; CHECK-NEXT:    vzeroupper
   6866 ; CHECK-NEXT:    ret{{[l|q]}}
   6867 entry:
   6868   %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6869   %0 = bitcast <4 x i64> %extract.i to <8 x i32>
   6870   %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   6871   %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
   6872   %add.i = add <8 x i32> %0, %1
   6873   %2 = bitcast <8 x i32> %add.i to <4 x i64>
   6874   %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   6875   %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
   6876   %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   6877   %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
   6878   %add5.i = add <4 x i32> %3, %4
   6879   %shuffle.i = shufflevector <4 x i32> %add5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   6880   %add6.i = add <4 x i32> %shuffle.i, %add5.i
   6881   %shuffle7.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   6882   %add8.i = add <4 x i32> %shuffle7.i, %add6.i
   6883   %vecext.i = extractelement <4 x i32> %add8.i, i32 0
   6884   ret i32 %vecext.i
   6885 }
   6886 
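; Unlike the i64 case, the 32-bit lane products use vpmulld directly.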
   6887 define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
   6888 ; CHECK-LABEL: test_mm512_reduce_mul_epi32:
   6889 ; CHECK:       # %bb.0: # %entry
   6890 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6891 ; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
   6892 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6893 ; CHECK-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   6894 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6895 ; CHECK-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
   6896 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   6897 ; CHECK-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
   6898 ; CHECK-NEXT:    vmovd %xmm0, %eax
   6899 ; CHECK-NEXT:    vzeroupper
   6900 ; CHECK-NEXT:    ret{{[l|q]}}
   6901 entry:
   6902   %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6903   %0 = bitcast <4 x i64> %extract.i to <8 x i32>
   6904   %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   6905   %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
   6906   %mul.i = mul <8 x i32> %0, %1
   6907   %2 = bitcast <8 x i32> %mul.i to <4 x i64>
   6908   %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   6909   %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
   6910   %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   6911   %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
   6912   %mul5.i = mul <4 x i32> %3, %4
   6913   %shuffle.i = shufflevector <4 x i32> %mul5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   6914   %mul6.i = mul <4 x i32> %shuffle.i, %mul5.i
   6915   %shuffle7.i = shufflevector <4 x i32> %mul6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   6916   %mul8.i = mul <4 x i32> %shuffle7.i, %mul6.i
   6917   %vecext.i = extractelement <4 x i32> %mul8.i, i32 0
   6918   ret i32 %vecext.i
   6919 }
   6920 
   6921 define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) {
   6922 ; CHECK-LABEL: test_mm512_reduce_or_epi32:
   6923 ; CHECK:       # %bb.0: # %entry
   6924 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6925 ; CHECK-NEXT:    vpor %ymm1, %ymm0, %ymm0
   6926 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6927 ; CHECK-NEXT:    vpor %xmm1, %xmm0, %xmm0
   6928 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6929 ; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
   6930 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   6931 ; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
   6932 ; CHECK-NEXT:    vmovd %xmm0, %eax
   6933 ; CHECK-NEXT:    vzeroupper
   6934 ; CHECK-NEXT:    ret{{[l|q]}}
   6935 entry:
   6936   %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6937   %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   6938   %or25.i = or <4 x i64> %extract.i, %extract2.i
   6939   %extract3.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   6940   %extract4.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   6941   %or526.i = or <2 x i64> %extract3.i, %extract4.i
   6942   %or5.i = bitcast <2 x i64> %or526.i to <4 x i32>
   6943   %shuffle.i = shufflevector <4 x i32> %or5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   6944   %or6.i = or <4 x i32> %shuffle.i, %or5.i
   6945   %shuffle7.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   6946   %or8.i = or <4 x i32> %shuffle7.i, %or6.i
   6947   %vecext.i = extractelement <4 x i32> %or8.i, i32 0
   6948   ret i32 %vecext.i
   6949 }
   6950 
   6951 define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
   6952 ; CHECK-LABEL: test_mm512_reduce_and_epi32:
   6953 ; CHECK:       # %bb.0: # %entry
   6954 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6955 ; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
   6956 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6957 ; CHECK-NEXT:    vpand %xmm1, %xmm0, %xmm0
   6958 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6959 ; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
   6960 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   6961 ; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
   6962 ; CHECK-NEXT:    vmovd %xmm0, %eax
   6963 ; CHECK-NEXT:    vzeroupper
   6964 ; CHECK-NEXT:    ret{{[l|q]}}
   6965 entry:
   6966   %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6967   %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   6968   %and25.i = and <4 x i64> %extract.i, %extract2.i
   6969   %extract3.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   6970   %extract4.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   6971   %and526.i = and <2 x i64> %extract3.i, %extract4.i
   6972   %and5.i = bitcast <2 x i64> %and526.i to <4 x i32>
   6973   %shuffle.i = shufflevector <4 x i32> %and5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   6974   %and6.i = and <4 x i32> %shuffle.i, %and5.i
   6975   %shuffle7.i = shufflevector <4 x i32> %and6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   6976   %and8.i = and <4 x i32> %shuffle7.i, %and6.i
   6977   %vecext.i = extractelement <4 x i32> %and8.i, i32 0
   6978   ret i32 %vecext.i
   6979 }
   6980 
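; Masked 32-bit variants take an i16 mask (one bit per dword lane); on X86 it
; is loaded straight from the stack into %k1 with kmovw.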
   6981 define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
   6982 ; X86-LABEL: test_mm512_mask_reduce_add_epi32:
   6983 ; X86:       # %bb.0: # %entry
   6984 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   6985 ; X86-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
   6986 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   6987 ; X86-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
   6988 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   6989 ; X86-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
   6990 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   6991 ; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
   6992 ; X86-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
   6993 ; X86-NEXT:    vmovd %xmm0, %eax
   6994 ; X86-NEXT:    vzeroupper
   6995 ; X86-NEXT:    retl
   6996 ;
   6997 ; X64-LABEL: test_mm512_mask_reduce_add_epi32:
   6998 ; X64:       # %bb.0: # %entry
   6999 ; X64-NEXT:    kmovw %edi, %k1
   7000 ; X64-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
   7001 ; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   7002 ; X64-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
   7003 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   7004 ; X64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
   7005 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   7006 ; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
   7007 ; X64-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
   7008 ; X64-NEXT:    vmovd %xmm0, %eax
   7009 ; X64-NEXT:    vzeroupper
   7010 ; X64-NEXT:    retq
   7011 entry:
   7012   %0 = bitcast <8 x i64> %__W to <16 x i32>
   7013   %1 = bitcast i16 %__M to <16 x i1>
   7014   %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
   7015   %3 = bitcast <16 x i32> %2 to <8 x i64>
   7016   %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7017   %4 = bitcast <4 x i64> %extract.i to <8 x i32>
   7018   %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7019   %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
   7020   %add.i = add <8 x i32> %4, %5
   7021   %6 = bitcast <8 x i32> %add.i to <4 x i64>
   7022   %extract4.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   7023   %7 = bitcast <2 x i64> %extract4.i to <4 x i32>
   7024   %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   7025   %8 = bitcast <2 x i64> %extract5.i to <4 x i32>
   7026   %add6.i = add <4 x i32> %7, %8
   7027   %shuffle.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   7028   %add7.i = add <4 x i32> %shuffle.i, %add6.i
   7029   %shuffle8.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   7030   %add9.i = add <4 x i32> %shuffle8.i, %add7.i
   7031   %vecext.i = extractelement <4 x i32> %add9.i, i32 0
   7032   ret i32 %vecext.i
   7033 }
   7034 
   7035 define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
   7036 ; X86-LABEL: test_mm512_mask_reduce_mul_epi32:
   7037 ; X86:       # %bb.0: # %entry
   7038 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   7039 ; X86-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   7040 ; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
   7041 ; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   7042 ; X86-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
   7043 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   7044 ; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   7045 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   7046 ; X86-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
   7047 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   7048 ; X86-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
   7049 ; X86-NEXT:    vmovd %xmm0, %eax
   7050 ; X86-NEXT:    vzeroupper
   7051 ; X86-NEXT:    retl
   7052 ;
   7053 ; X64-LABEL: test_mm512_mask_reduce_mul_epi32:
   7054 ; X64:       # %bb.0: # %entry
   7055 ; X64-NEXT:    kmovw %edi, %k1
   7056 ; X64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   7057 ; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
   7058 ; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   7059 ; X64-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
   7060 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   7061 ; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   7062 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   7063 ; X64-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
   7064 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   7065 ; X64-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
   7066 ; X64-NEXT:    vmovd %xmm0, %eax
   7067 ; X64-NEXT:    vzeroupper
   7068 ; X64-NEXT:    retq
   7069 entry:
   7070   %0 = bitcast <8 x i64> %__W to <16 x i32>
   7071   %1 = bitcast i16 %__M to <16 x i1>
   7072   %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   7073   %3 = bitcast <16 x i32> %2 to <8 x i64>
   7074   %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7075   %4 = bitcast <4 x i64> %extract.i to <8 x i32>
   7076   %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7077   %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
   7078   %mul.i = mul <8 x i32> %4, %5
   7079   %6 = bitcast <8 x i32> %mul.i to <4 x i64>
   7080   %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   7081   %7 = bitcast <2 x i64> %extract5.i to <4 x i32>
   7082   %extract6.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   7083   %8 = bitcast <2 x i64> %extract6.i to <4 x i32>
   7084   %mul7.i = mul <4 x i32> %7, %8
   7085   %shuffle.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   7086   %mul8.i = mul <4 x i32> %shuffle.i, %mul7.i
   7087   %shuffle9.i = shufflevector <4 x i32> %mul8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   7088   %mul10.i = mul <4 x i32> %shuffle9.i, %mul8.i
   7089   %vecext.i = extractelement <4 x i32> %mul10.i, i32 0
   7090   ret i32 %vecext.i
   7091 }
   7092 
   7093 define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
   7094 ; X86-LABEL: test_mm512_mask_reduce_and_epi32:
   7095 ; X86:       # %bb.0: # %entry
   7096 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   7097 ; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
   7098 ; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
   7099 ; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   7100 ; X86-NEXT:    vpand %ymm0, %ymm1, %ymm0
   7101 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   7102 ; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
   7103 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   7104 ; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
   7105 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   7106 ; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
   7107 ; X86-NEXT:    vmovd %xmm0, %eax
   7108 ; X86-NEXT:    vzeroupper
   7109 ; X86-NEXT:    retl
   7110 ;
   7111 ; X64-LABEL: test_mm512_mask_reduce_and_epi32:
   7112 ; X64:       # %bb.0: # %entry
   7113 ; X64-NEXT:    kmovw %edi, %k1
   7114 ; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
   7115 ; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
   7116 ; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   7117 ; X64-NEXT:    vpand %ymm0, %ymm1, %ymm0
   7118 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   7119 ; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
   7120 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   7121 ; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
   7122 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   7123 ; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
   7124 ; X64-NEXT:    vmovd %xmm0, %eax
   7125 ; X64-NEXT:    vzeroupper
   7126 ; X64-NEXT:    retq
   7127 entry:
   7128   %0 = bitcast <8 x i64> %__W to <16 x i32>
   7129   %1 = bitcast i16 %__M to <16 x i1>
   7130   %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
   7131   %3 = bitcast <16 x i32> %2 to <8 x i64>
   7132   %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7133   %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7134   %and28.i = and <4 x i64> %extract.i, %extract4.i
   7135   %extract5.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   7136   %extract6.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   7137   %and729.i = and <2 x i64> %extract5.i, %extract6.i
   7138   %and7.i = bitcast <2 x i64> %and729.i to <4 x i32>
   7139   %shuffle.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   7140   %and8.i = and <4 x i32> %shuffle.i, %and7.i
   7141   %shuffle9.i = shufflevector <4 x i32> %and8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   7142   %and10.i = and <4 x i32> %shuffle9.i, %and8.i
   7143   %vecext.i = extractelement <4 x i32> %and10.i, i32 0
   7144   ret i32 %vecext.i
   7145 }
   7146 
   7147 define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
   7148 ; X86-LABEL: test_mm512_mask_reduce_or_epi32:
   7149 ; X86:       # %bb.0: # %entry
   7150 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   7151 ; X86-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
   7152 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   7153 ; X86-NEXT:    vpor %ymm1, %ymm0, %ymm0
   7154 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   7155 ; X86-NEXT:    vpor %xmm1, %xmm0, %xmm0
   7156 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   7157 ; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
   7158 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   7159 ; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
   7160 ; X86-NEXT:    vmovd %xmm0, %eax
   7161 ; X86-NEXT:    vzeroupper
   7162 ; X86-NEXT:    retl
   7163 ;
   7164 ; X64-LABEL: test_mm512_mask_reduce_or_epi32:
   7165 ; X64:       # %bb.0: # %entry
   7166 ; X64-NEXT:    kmovw %edi, %k1
   7167 ; X64-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
   7168 ; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   7169 ; X64-NEXT:    vpor %ymm1, %ymm0, %ymm0
   7170 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   7171 ; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
   7172 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   7173 ; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
   7174 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   7175 ; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
   7176 ; X64-NEXT:    vmovd %xmm0, %eax
   7177 ; X64-NEXT:    vzeroupper
   7178 ; X64-NEXT:    retq
   7179 entry:
   7180   %0 = bitcast <8 x i64> %__W to <16 x i32>
   7181   %1 = bitcast i16 %__M to <16 x i1>
   7182   %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
   7183   %3 = bitcast <16 x i32> %2 to <8 x i64>
   7184   %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7185   %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7186   %or27.i = or <4 x i64> %extract.i, %extract3.i
   7187   %extract4.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   7188   %extract5.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   7189   %or628.i = or <2 x i64> %extract4.i, %extract5.i
   7190   %or6.i = bitcast <2 x i64> %or628.i to <4 x i32>
   7191   %shuffle.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   7192   %or7.i = or <4 x i32> %shuffle.i, %or6.i
   7193   %shuffle8.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   7194   %or9.i = or <4 x i32> %shuffle8.i, %or7.i
   7195   %vecext.i = extractelement <4 x i32> %or9.i, i32 0
   7196   ret i32 %vecext.i
   7197 }
   7198 
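; Floating-point reductions follow the same halving scheme with vaddpd/vmulpd.
; On X86 the scalar result is returned in st(0), hence the aligned stack slot,
; the vmovlpd store and the fldl reload; X64 simply leaves it in %xmm0.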
   7199 define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
   7200 ; X86-LABEL: test_mm512_reduce_add_pd:
   7201 ; X86:       # %bb.0: # %entry
   7202 ; X86-NEXT:    pushl %ebp
   7203 ; X86-NEXT:    .cfi_def_cfa_offset 8
   7204 ; X86-NEXT:    .cfi_offset %ebp, -8
   7205 ; X86-NEXT:    movl %esp, %ebp
   7206 ; X86-NEXT:    .cfi_def_cfa_register %ebp
   7207 ; X86-NEXT:    andl $-8, %esp
   7208 ; X86-NEXT:    subl $8, %esp
   7209 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7210 ; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
   7211 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7212 ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
   7213 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7214 ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
   7215 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
   7216 ; X86-NEXT:    fldl (%esp)
   7217 ; X86-NEXT:    movl %ebp, %esp
   7218 ; X86-NEXT:    popl %ebp
   7219 ; X86-NEXT:    .cfi_def_cfa %esp, 4
   7220 ; X86-NEXT:    vzeroupper
   7221 ; X86-NEXT:    retl
   7222 ;
   7223 ; X64-LABEL: test_mm512_reduce_add_pd:
   7224 ; X64:       # %bb.0: # %entry
   7225 ; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7226 ; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
   7227 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7228 ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
   7229 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7230 ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
   7231 ; X64-NEXT:    vzeroupper
   7232 ; X64-NEXT:    retq
   7233 entry:
   7234   %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7235   %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7236   %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
   7237   %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   7238   %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   7239   %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
   7240   %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   7241   %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
   7242   %vecext.i = extractelement <2 x double> %add7.i, i32 0
   7243   ret double %vecext.i
   7244 }
   7245 
   7246 define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
   7247 ; X86-LABEL: test_mm512_reduce_mul_pd:
   7248 ; X86:       # %bb.0: # %entry
   7249 ; X86-NEXT:    pushl %ebp
   7250 ; X86-NEXT:    .cfi_def_cfa_offset 8
   7251 ; X86-NEXT:    .cfi_offset %ebp, -8
   7252 ; X86-NEXT:    movl %esp, %ebp
   7253 ; X86-NEXT:    .cfi_def_cfa_register %ebp
   7254 ; X86-NEXT:    andl $-8, %esp
   7255 ; X86-NEXT:    subl $8, %esp
   7256 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7257 ; X86-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
   7258 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7259 ; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
   7260 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7261 ; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
   7262 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
   7263 ; X86-NEXT:    fldl (%esp)
   7264 ; X86-NEXT:    movl %ebp, %esp
   7265 ; X86-NEXT:    popl %ebp
   7266 ; X86-NEXT:    .cfi_def_cfa %esp, 4
   7267 ; X86-NEXT:    vzeroupper
   7268 ; X86-NEXT:    retl
   7269 ;
   7270 ; X64-LABEL: test_mm512_reduce_mul_pd:
   7271 ; X64:       # %bb.0: # %entry
   7272 ; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7273 ; X64-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
   7274 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7275 ; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
   7276 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7277 ; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
   7278 ; X64-NEXT:    vzeroupper
   7279 ; X64-NEXT:    retq
   7280 entry:
   7281   %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7282   %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7283   %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
   7284   %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   7285   %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   7286   %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
   7287   %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   7288   %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
   7289   %vecext.i = extractelement <2 x double> %mul7.i, i32 0
   7290   ret double %vecext.i
   7291 }
   7292 
   7293 define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
   7294 ; X86-LABEL: test_mm512_reduce_add_ps:
   7295 ; X86:       # %bb.0: # %entry
   7296 ; X86-NEXT:    pushl %eax
   7297 ; X86-NEXT:    .cfi_def_cfa_offset 8
   7298 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7299 ; X86-NEXT:    vaddps %ymm1, %ymm0, %ymm0
   7300 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7301 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   7302 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7303 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   7304 ; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   7305 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   7306 ; X86-NEXT:    vmovss %xmm0, (%esp)
   7307 ; X86-NEXT:    flds (%esp)
   7308 ; X86-NEXT:    popl %eax
   7309 ; X86-NEXT:    .cfi_def_cfa_offset 4
   7310 ; X86-NEXT:    vzeroupper
   7311 ; X86-NEXT:    retl
   7312 ;
   7313 ; X64-LABEL: test_mm512_reduce_add_ps:
   7314 ; X64:       # %bb.0: # %entry
   7315 ; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7316 ; X64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
   7317 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7318 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   7319 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7320 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   7321 ; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   7322 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   7323 ; X64-NEXT:    vzeroupper
   7324 ; X64-NEXT:    retq
   7325 entry:
   7326   %0 = bitcast <16 x float> %__W to <8 x double>
   7327   %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7328   %1 = bitcast <4 x double> %extract.i to <8 x float>
   7329   %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7330   %2 = bitcast <4 x double> %extract2.i to <8 x float>
   7331   %add.i = fadd <8 x float> %1, %2
   7332   %extract3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7333   %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7334   %add5.i = fadd <4 x float> %extract3.i, %extract4.i
   7335   %shuffle.i = shufflevector <4 x float> %add5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   7336   %add6.i = fadd <4 x float> %add5.i, %shuffle.i
   7337   %shuffle7.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   7338   %add8.i = fadd <4 x float> %add6.i, %shuffle7.i
   7339   %vecext.i = extractelement <4 x float> %add8.i, i32 0
   7340   ret float %vecext.i
   7341 }
   7342 
   7343 define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
   7344 ; X86-LABEL: test_mm512_reduce_mul_ps:
   7345 ; X86:       # %bb.0: # %entry
   7346 ; X86-NEXT:    pushl %eax
   7347 ; X86-NEXT:    .cfi_def_cfa_offset 8
   7348 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7349 ; X86-NEXT:    vmulps %ymm1, %ymm0, %ymm0
   7350 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7351 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
   7352 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7353 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
   7354 ; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   7355 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
   7356 ; X86-NEXT:    vmovss %xmm0, (%esp)
   7357 ; X86-NEXT:    flds (%esp)
   7358 ; X86-NEXT:    popl %eax
   7359 ; X86-NEXT:    .cfi_def_cfa_offset 4
   7360 ; X86-NEXT:    vzeroupper
   7361 ; X86-NEXT:    retl
   7362 ;
   7363 ; X64-LABEL: test_mm512_reduce_mul_ps:
   7364 ; X64:       # %bb.0: # %entry
   7365 ; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7366 ; X64-NEXT:    vmulps %ymm1, %ymm0, %ymm0
   7367 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7368 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
   7369 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7370 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
   7371 ; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   7372 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
   7373 ; X64-NEXT:    vzeroupper
   7374 ; X64-NEXT:    retq
   7375 entry:
   7376   %0 = bitcast <16 x float> %__W to <8 x double>
   7377   %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7378   %1 = bitcast <4 x double> %extract.i to <8 x float>
   7379   %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7380   %2 = bitcast <4 x double> %extract2.i to <8 x float>
   7381   %mul.i = fmul <8 x float> %1, %2
   7382   %extract3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7383   %extract4.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7384   %mul5.i = fmul <4 x float> %extract3.i, %extract4.i
   7385   %shuffle.i = shufflevector <4 x float> %mul5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   7386   %mul6.i = fmul <4 x float> %mul5.i, %shuffle.i
   7387   %shuffle7.i = shufflevector <4 x float> %mul6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   7388   %mul8.i = fmul <4 x float> %mul6.i, %shuffle7.i
   7389   %vecext.i = extractelement <4 x float> %mul8.i, i32 0
   7390   ret float %vecext.i
   7391 }
   7392 
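; Masked FP reductions: the i8/i16 mask selects lanes, with 0.0 (add) or
; 1.0 (mul) filling the masked-off elements before the halving steps.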
   7393 define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) {
   7394 ; X86-LABEL: test_mm512_mask_reduce_add_pd:
   7395 ; X86:       # %bb.0: # %entry
   7396 ; X86-NEXT:    pushl %ebp
   7397 ; X86-NEXT:    .cfi_def_cfa_offset 8
   7398 ; X86-NEXT:    .cfi_offset %ebp, -8
   7399 ; X86-NEXT:    movl %esp, %ebp
   7400 ; X86-NEXT:    .cfi_def_cfa_register %ebp
   7401 ; X86-NEXT:    andl $-8, %esp
   7402 ; X86-NEXT:    subl $8, %esp
   7403 ; X86-NEXT:    movb 8(%ebp), %al
   7404 ; X86-NEXT:    kmovw %eax, %k1
   7405 ; X86-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
   7406 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7407 ; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
   7408 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7409 ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
   7410 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7411 ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
   7412 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
   7413 ; X86-NEXT:    fldl (%esp)
   7414 ; X86-NEXT:    movl %ebp, %esp
   7415 ; X86-NEXT:    popl %ebp
   7416 ; X86-NEXT:    .cfi_def_cfa %esp, 4
   7417 ; X86-NEXT:    vzeroupper
   7418 ; X86-NEXT:    retl
   7419 ;
   7420 ; X64-LABEL: test_mm512_mask_reduce_add_pd:
   7421 ; X64:       # %bb.0: # %entry
   7422 ; X64-NEXT:    kmovw %edi, %k1
   7423 ; X64-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
   7424 ; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7425 ; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
   7426 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7427 ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
   7428 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7429 ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
   7430 ; X64-NEXT:    vzeroupper
   7431 ; X64-NEXT:    retq
   7432 entry:
   7433   %0 = bitcast i8 %__M to <8 x i1>
   7434   %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> zeroinitializer
   7435   %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7436   %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7437   %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
   7438   %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   7439   %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   7440   %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
   7441   %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   7442   %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
   7443   %vecext.i = extractelement <2 x double> %add7.i, i32 0
   7444   ret double %vecext.i
   7445 }
   7446 
   7447 define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) {
   7448 ; X86-LABEL: test_mm512_mask_reduce_mul_pd:
   7449 ; X86:       # %bb.0: # %entry
   7450 ; X86-NEXT:    pushl %ebp
   7451 ; X86-NEXT:    .cfi_def_cfa_offset 8
   7452 ; X86-NEXT:    .cfi_offset %ebp, -8
   7453 ; X86-NEXT:    movl %esp, %ebp
   7454 ; X86-NEXT:    .cfi_def_cfa_register %ebp
   7455 ; X86-NEXT:    andl $-8, %esp
   7456 ; X86-NEXT:    subl $8, %esp
   7457 ; X86-NEXT:    movb 8(%ebp), %al
   7458 ; X86-NEXT:    kmovw %eax, %k1
   7459 ; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
   7460 ; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
   7461 ; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
   7462 ; X86-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
   7463 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7464 ; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
   7465 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7466 ; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
   7467 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
   7468 ; X86-NEXT:    fldl (%esp)
   7469 ; X86-NEXT:    movl %ebp, %esp
   7470 ; X86-NEXT:    popl %ebp
   7471 ; X86-NEXT:    .cfi_def_cfa %esp, 4
   7472 ; X86-NEXT:    vzeroupper
   7473 ; X86-NEXT:    retl
   7474 ;
   7475 ; X64-LABEL: test_mm512_mask_reduce_mul_pd:
   7476 ; X64:       # %bb.0: # %entry
   7477 ; X64-NEXT:    kmovw %edi, %k1
   7478 ; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
   7479 ; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
   7480 ; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
   7481 ; X64-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
   7482 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7483 ; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
   7484 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7485 ; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
   7486 ; X64-NEXT:    vzeroupper
   7487 ; X64-NEXT:    retq
   7488 entry:
   7489   %0 = bitcast i8 %__M to <8 x i1>
   7490   %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
   7491   %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7492   %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7493   %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
   7494   %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   7495   %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   7496   %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
   7497   %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   7498   %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
   7499   %vecext.i = extractelement <2 x double> %mul7.i, i32 0
   7500   ret double %vecext.i
   7501 }
   7502 
   7503 define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) {
   7504 ; X86-LABEL: test_mm512_mask_reduce_add_ps:
   7505 ; X86:       # %bb.0: # %entry
   7506 ; X86-NEXT:    pushl %eax
   7507 ; X86-NEXT:    .cfi_def_cfa_offset 8
   7508 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   7509 ; X86-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
   7510 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7511 ; X86-NEXT:    vaddps %ymm1, %ymm0, %ymm0
   7512 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7513 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   7514 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7515 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   7516 ; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   7517 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   7518 ; X86-NEXT:    vmovss %xmm0, (%esp)
   7519 ; X86-NEXT:    flds (%esp)
   7520 ; X86-NEXT:    popl %eax
   7521 ; X86-NEXT:    .cfi_def_cfa_offset 4
   7522 ; X86-NEXT:    vzeroupper
   7523 ; X86-NEXT:    retl
   7524 ;
   7525 ; X64-LABEL: test_mm512_mask_reduce_add_ps:
   7526 ; X64:       # %bb.0: # %entry
   7527 ; X64-NEXT:    kmovw %edi, %k1
   7528 ; X64-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
   7529 ; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7530 ; X64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
   7531 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7532 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   7533 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7534 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   7535 ; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   7536 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
   7537 ; X64-NEXT:    vzeroupper
   7538 ; X64-NEXT:    retq
   7539 entry:
   7540   %0 = bitcast i16 %__M to <16 x i1>
   7541   %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer
   7542   %2 = bitcast <16 x float> %1 to <8 x double>
   7543   %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7544   %3 = bitcast <4 x double> %extract.i to <8 x float>
   7545   %extract3.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7546   %4 = bitcast <4 x double> %extract3.i to <8 x float>
   7547   %add.i = fadd <8 x float> %3, %4
   7548   %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7549   %extract5.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7550   %add6.i = fadd <4 x float> %extract4.i, %extract5.i
   7551   %shuffle.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   7552   %add7.i = fadd <4 x float> %add6.i, %shuffle.i
   7553   %shuffle8.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   7554   %add9.i = fadd <4 x float> %add7.i, %shuffle8.i
   7555   %vecext.i = extractelement <4 x float> %add9.i, i32 0
   7556   ret float %vecext.i
   7557 }
   7558 
   7559 define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) {
   7560 ; X86-LABEL: test_mm512_mask_reduce_mul_ps:
   7561 ; X86:       # %bb.0: # %entry
   7562 ; X86-NEXT:    pushl %eax
   7563 ; X86-NEXT:    .cfi_def_cfa_offset 8
   7564 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   7565 ; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   7566 ; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
   7567 ; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
   7568 ; X86-NEXT:    vmulps %ymm0, %ymm1, %ymm0
   7569 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7570 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
   7571 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7572 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
   7573 ; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   7574 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
   7575 ; X86-NEXT:    vmovss %xmm0, (%esp)
   7576 ; X86-NEXT:    flds (%esp)
   7577 ; X86-NEXT:    popl %eax
   7578 ; X86-NEXT:    .cfi_def_cfa_offset 4
   7579 ; X86-NEXT:    vzeroupper
   7580 ; X86-NEXT:    retl
   7581 ;
   7582 ; X64-LABEL: test_mm512_mask_reduce_mul_ps:
   7583 ; X64:       # %bb.0: # %entry
   7584 ; X64-NEXT:    kmovw %edi, %k1
   7585 ; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   7586 ; X64-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
   7587 ; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
   7588 ; X64-NEXT:    vmulps %ymm0, %ymm1, %ymm0
   7589 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7590 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
   7591 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7592 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
   7593 ; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   7594 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
   7595 ; X64-NEXT:    vzeroupper
   7596 ; X64-NEXT:    retq
   7597 entry:
   7598   %0 = bitcast i16 %__M to <16 x i1>
   7599   %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
   7600   %2 = bitcast <16 x float> %1 to <8 x double>
   7601   %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7602   %3 = bitcast <4 x double> %extract.i to <8 x float>
   7603   %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7604   %4 = bitcast <4 x double> %extract4.i to <8 x float>
   7605   %mul.i = fmul <8 x float> %3, %4
   7606   %extract5.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7607   %extract6.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7608   %mul7.i = fmul <4 x float> %extract5.i, %extract6.i
   7609   %shuffle.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   7610   %mul8.i = fmul <4 x float> %mul7.i, %shuffle.i
   7611   %shuffle9.i = shufflevector <4 x float> %mul8.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   7612   %mul10.i = fmul <4 x float> %mul8.i, %shuffle9.i
   7613   %vecext.i = extractelement <4 x float> %mul10.i, i32 0
   7614   ret float %vecext.i
   7615 }
   7616 
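; Signed 64-bit max reduction: three shuffle/compare/select rounds collapse the
; eight lanes to one (lowered to vpmaxsq); on X86 the i64 result is returned in
; edx:eax, on X64 in rax.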
   7617 define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) {
   7618 ; X86-LABEL: test_mm512_reduce_max_epi64:
   7619 ; X86:       # %bb.0: # %entry
   7620 ; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
   7621 ; X86-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
   7622 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   7623 ; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
   7624 ; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   7625 ; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
   7626 ; X86-NEXT:    vmovd %xmm0, %eax
   7627 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   7628 ; X86-NEXT:    vzeroupper
   7629 ; X86-NEXT:    retl
   7630 ;
   7631 ; X64-LABEL: test_mm512_reduce_max_epi64:
   7632 ; X64:       # %bb.0: # %entry
   7633 ; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
   7634 ; X64-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
   7635 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   7636 ; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
   7637 ; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   7638 ; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
   7639 ; X64-NEXT:    vmovq %xmm0, %rax
   7640 ; X64-NEXT:    vzeroupper
   7641 ; X64-NEXT:    retq
   7642 entry:
   7643   %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   7644   %0 = icmp slt <8 x i64> %shuffle.i, %__W
   7645   %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
   7646   %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   7647   %2 = icmp sgt <8 x i64> %1, %shuffle1.i
   7648   %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
   7649   %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   7650   %4 = icmp sgt <8 x i64> %3, %shuffle3.i
   7651   %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
   7652   %vecext.i = extractelement <8 x i64> %5, i32 0
   7653   ret i64 %vecext.i
   7654 }
   7655 
   7656 define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) {
   7657 ; X86-LABEL: test_mm512_reduce_max_epu64:
   7658 ; X86:       # %bb.0: # %entry
   7659 ; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
   7660 ; X86-NEXT:    vpmaxuq %zmm0, %zmm1, %zmm0
   7661 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   7662 ; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
   7663 ; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   7664 ; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
   7665 ; X86-NEXT:    vmovd %xmm0, %eax
   7666 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   7667 ; X86-NEXT:    vzeroupper
   7668 ; X86-NEXT:    retl
   7669 ;
   7670 ; X64-LABEL: test_mm512_reduce_max_epu64:
   7671 ; X64:       # %bb.0: # %entry
   7672 ; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
   7673 ; X64-NEXT:    vpmaxuq %zmm0, %zmm1, %zmm0
   7674 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   7675 ; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
   7676 ; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   7677 ; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
   7678 ; X64-NEXT:    vmovq %xmm0, %rax
   7679 ; X64-NEXT:    vzeroupper
   7680 ; X64-NEXT:    retq
   7681 entry:
   7682   %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   7683   %0 = icmp ult <8 x i64> %shuffle.i, %__W
   7684   %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
   7685   %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   7686   %2 = icmp ugt <8 x i64> %1, %shuffle1.i
   7687   %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
   7688   %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   7689   %4 = icmp ugt <8 x i64> %3, %shuffle3.i
   7690   %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
   7691   %vecext.i = extractelement <8 x i64> %5, i32 0
   7692   ret i64 %vecext.i
   7693 }
   7694 
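; Double max reduction built from the AVX/SSE2 max intrinsics; the X86 version
; sets up an aligned 8-byte spill slot so it can return the result via the x87
; stack (fldl).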
   7695 define double @test_mm512_reduce_max_pd(<8 x double> %__W) {
   7696 ; X86-LABEL: test_mm512_reduce_max_pd:
   7697 ; X86:       # %bb.0: # %entry
   7698 ; X86-NEXT:    pushl %ebp
   7699 ; X86-NEXT:    .cfi_def_cfa_offset 8
   7700 ; X86-NEXT:    .cfi_offset %ebp, -8
   7701 ; X86-NEXT:    movl %esp, %ebp
   7702 ; X86-NEXT:    .cfi_def_cfa_register %ebp
   7703 ; X86-NEXT:    andl $-8, %esp
   7704 ; X86-NEXT:    subl $8, %esp
   7705 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7706 ; X86-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
   7707 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7708 ; X86-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
   7709 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7710 ; X86-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
   7711 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
   7712 ; X86-NEXT:    fldl (%esp)
   7713 ; X86-NEXT:    movl %ebp, %esp
   7714 ; X86-NEXT:    popl %ebp
   7715 ; X86-NEXT:    .cfi_def_cfa %esp, 4
   7716 ; X86-NEXT:    vzeroupper
   7717 ; X86-NEXT:    retl
   7718 ;
   7719 ; X64-LABEL: test_mm512_reduce_max_pd:
   7720 ; X64:       # %bb.0: # %entry
   7721 ; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7722 ; X64-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
   7723 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7724 ; X64-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
   7725 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7726 ; X64-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
   7727 ; X64-NEXT:    vzeroupper
   7728 ; X64-NEXT:    retq
   7729 entry:
   7730   %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7731   %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7732   %0 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
   7733   %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   7734   %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   7735   %1 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
   7736   %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   7737   %2 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %shuffle.i)
   7738   %vecext.i = extractelement <2 x double> %2, i32 0
   7739   ret double %vecext.i
   7740 }
   7741 
   7742 define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) {
   7743 ; X86-LABEL: test_mm512_reduce_min_epi64:
   7744 ; X86:       # %bb.0: # %entry
   7745 ; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
   7746 ; X86-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
   7747 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   7748 ; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
   7749 ; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   7750 ; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
   7751 ; X86-NEXT:    vmovd %xmm0, %eax
   7752 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   7753 ; X86-NEXT:    vzeroupper
   7754 ; X86-NEXT:    retl
   7755 ;
   7756 ; X64-LABEL: test_mm512_reduce_min_epi64:
   7757 ; X64:       # %bb.0: # %entry
   7758 ; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
   7759 ; X64-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
   7760 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   7761 ; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
   7762 ; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   7763 ; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
   7764 ; X64-NEXT:    vmovq %xmm0, %rax
   7765 ; X64-NEXT:    vzeroupper
   7766 ; X64-NEXT:    retq
   7767 entry:
   7768   %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   7769   %0 = icmp sgt <8 x i64> %shuffle.i, %__W
   7770   %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
   7771   %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   7772   %2 = icmp slt <8 x i64> %1, %shuffle1.i
   7773   %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
   7774   %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   7775   %4 = icmp slt <8 x i64> %3, %shuffle3.i
   7776   %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
   7777   %vecext.i = extractelement <8 x i64> %5, i32 0
   7778   ret i64 %vecext.i
   7779 }
   7780 
   7781 define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) {
   7782 ; X86-LABEL: test_mm512_reduce_min_epu64:
   7783 ; X86:       # %bb.0: # %entry
   7784 ; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
   7785 ; X86-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
   7786 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   7787 ; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
   7788 ; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   7789 ; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
   7790 ; X86-NEXT:    vmovd %xmm0, %eax
   7791 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   7792 ; X86-NEXT:    vzeroupper
   7793 ; X86-NEXT:    retl
   7794 ;
   7795 ; X64-LABEL: test_mm512_reduce_min_epu64:
   7796 ; X64:       # %bb.0: # %entry
   7797 ; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
   7798 ; X64-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
   7799 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   7800 ; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
   7801 ; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   7802 ; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
   7803 ; X64-NEXT:    vmovq %xmm0, %rax
   7804 ; X64-NEXT:    vzeroupper
   7805 ; X64-NEXT:    retq
   7806 entry:
   7807   %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   7808   %0 = icmp ugt <8 x i64> %shuffle.i, %__W
   7809   %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
   7810   %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   7811   %2 = icmp ult <8 x i64> %1, %shuffle1.i
   7812   %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
   7813   %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   7814   %4 = icmp ult <8 x i64> %3, %shuffle3.i
   7815   %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
   7816   %vecext.i = extractelement <8 x i64> %5, i32 0
   7817   ret i64 %vecext.i
   7818 }
   7819 
   7820 define double @test_mm512_reduce_min_pd(<8 x double> %__W) {
   7821 ; X86-LABEL: test_mm512_reduce_min_pd:
   7822 ; X86:       # %bb.0: # %entry
   7823 ; X86-NEXT:    pushl %ebp
   7824 ; X86-NEXT:    .cfi_def_cfa_offset 8
   7825 ; X86-NEXT:    .cfi_offset %ebp, -8
   7826 ; X86-NEXT:    movl %esp, %ebp
   7827 ; X86-NEXT:    .cfi_def_cfa_register %ebp
   7828 ; X86-NEXT:    andl $-8, %esp
   7829 ; X86-NEXT:    subl $8, %esp
   7830 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7831 ; X86-NEXT:    vminpd %ymm1, %ymm0, %ymm0
   7832 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7833 ; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
   7834 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7835 ; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
   7836 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
   7837 ; X86-NEXT:    fldl (%esp)
   7838 ; X86-NEXT:    movl %ebp, %esp
   7839 ; X86-NEXT:    popl %ebp
   7840 ; X86-NEXT:    .cfi_def_cfa %esp, 4
   7841 ; X86-NEXT:    vzeroupper
   7842 ; X86-NEXT:    retl
   7843 ;
   7844 ; X64-LABEL: test_mm512_reduce_min_pd:
   7845 ; X64:       # %bb.0: # %entry
   7846 ; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   7847 ; X64-NEXT:    vminpd %ymm1, %ymm0, %ymm0
   7848 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7849 ; X64-NEXT:    vminpd %xmm1, %xmm0, %xmm0
   7850 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7851 ; X64-NEXT:    vminpd %xmm1, %xmm0, %xmm0
   7852 ; X64-NEXT:    vzeroupper
   7853 ; X64-NEXT:    retq
   7854 entry:
   7855   %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7856   %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7857   %0 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
   7858   %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   7859   %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   7860   %1 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
   7861   %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   7862   %2 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %shuffle.i)
   7863   %vecext.i = extractelement <2 x double> %2, i32 0
   7864   ret double %vecext.i
   7865 }
   7866 
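; Masked signed 64-bit max reduction: masked-off lanes get the identity
; INT64_MIN. X86 loads the splat from a constant pool (printed as 32-bit
; halves), while X64 can use vpbroadcastq.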
   7867 define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
   7868 ; X86-LABEL: test_mm512_mask_reduce_max_epi64:
   7869 ; X86:       # %bb.0: # %entry
   7870 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   7871 ; X86-NEXT:    kmovw %eax, %k1
   7872 ; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
   7873 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
   7874 ; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
   7875 ; X86-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
   7876 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   7877 ; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
   7878 ; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   7879 ; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
   7880 ; X86-NEXT:    vmovd %xmm0, %eax
   7881 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   7882 ; X86-NEXT:    vzeroupper
   7883 ; X86-NEXT:    retl
   7884 ;
   7885 ; X64-LABEL: test_mm512_mask_reduce_max_epi64:
   7886 ; X64:       # %bb.0: # %entry
   7887 ; X64-NEXT:    kmovw %edi, %k1
   7888 ; X64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
   7889 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
   7890 ; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
   7891 ; X64-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
   7892 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   7893 ; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
   7894 ; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   7895 ; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
   7896 ; X64-NEXT:    vmovq %xmm0, %rax
   7897 ; X64-NEXT:    vzeroupper
   7898 ; X64-NEXT:    retq
   7899 entry:
   7900   %0 = bitcast i8 %__M to <8 x i1>
   7901   %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
   7902   %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   7903   %2 = icmp sgt <8 x i64> %1, %shuffle.i
   7904   %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
   7905   %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   7906   %4 = icmp sgt <8 x i64> %3, %shuffle3.i
   7907   %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
   7908   %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   7909   %6 = icmp sgt <8 x i64> %5, %shuffle5.i
   7910   %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
   7911   %vecext.i = extractelement <8 x i64> %7, i32 0
   7912   ret i64 %vecext.i
   7913 }
   7914 
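; Masked unsigned 64-bit max reduction: the identity is zero, so the masked
; select folds into a single zeroing move (vmovdqa64 {%k1} {z}).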
   7915 define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
   7916 ; X86-LABEL: test_mm512_mask_reduce_max_epu64:
   7917 ; X86:       # %bb.0: # %entry
   7918 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   7919 ; X86-NEXT:    kmovw %eax, %k1
   7920 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
   7921 ; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
   7922 ; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
   7923 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   7924 ; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
   7925 ; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   7926 ; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
   7927 ; X86-NEXT:    vmovd %xmm0, %eax
   7928 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   7929 ; X86-NEXT:    vzeroupper
   7930 ; X86-NEXT:    retl
   7931 ;
   7932 ; X64-LABEL: test_mm512_mask_reduce_max_epu64:
   7933 ; X64:       # %bb.0: # %entry
   7934 ; X64-NEXT:    kmovw %edi, %k1
   7935 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
   7936 ; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
   7937 ; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
   7938 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   7939 ; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
   7940 ; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   7941 ; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
   7942 ; X64-NEXT:    vmovq %xmm0, %rax
   7943 ; X64-NEXT:    vzeroupper
   7944 ; X64-NEXT:    retq
   7945 entry:
   7946   %0 = bitcast i8 %__M to <8 x i1>
   7947   %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
   7948   %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   7949   %2 = icmp ugt <8 x i64> %1, %shuffle.i
   7950   %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
   7951   %shuffle2.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   7952   %4 = icmp ugt <8 x i64> %3, %shuffle2.i
   7953   %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle2.i
   7954   %shuffle4.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   7955   %6 = icmp ugt <8 x i64> %5, %shuffle4.i
   7956   %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle4.i
   7957   %vecext.i = extractelement <8 x i64> %7, i32 0
   7958   ret i64 %vecext.i
   7959 }
   7960 
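; Masked double max reduction with -Inf as the identity for masked-off lanes.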
   7961 define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) {
   7962 ; X86-LABEL: test_mm512_mask_reduce_max_pd:
   7963 ; X86:       # %bb.0: # %entry
   7964 ; X86-NEXT:    pushl %ebp
   7965 ; X86-NEXT:    .cfi_def_cfa_offset 8
   7966 ; X86-NEXT:    .cfi_offset %ebp, -8
   7967 ; X86-NEXT:    movl %esp, %ebp
   7968 ; X86-NEXT:    .cfi_def_cfa_register %ebp
   7969 ; X86-NEXT:    andl $-8, %esp
   7970 ; X86-NEXT:    subl $8, %esp
   7971 ; X86-NEXT:    movb 8(%ebp), %al
   7972 ; X86-NEXT:    kmovw %eax, %k1
   7973 ; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
   7974 ; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
   7975 ; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
   7976 ; X86-NEXT:    vmaxpd %ymm0, %ymm1, %ymm0
   7977 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7978 ; X86-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
   7979 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7980 ; X86-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
   7981 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
   7982 ; X86-NEXT:    fldl (%esp)
   7983 ; X86-NEXT:    movl %ebp, %esp
   7984 ; X86-NEXT:    popl %ebp
   7985 ; X86-NEXT:    .cfi_def_cfa %esp, 4
   7986 ; X86-NEXT:    vzeroupper
   7987 ; X86-NEXT:    retl
   7988 ;
   7989 ; X64-LABEL: test_mm512_mask_reduce_max_pd:
   7990 ; X64:       # %bb.0: # %entry
   7991 ; X64-NEXT:    kmovw %edi, %k1
   7992 ; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
   7993 ; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
   7994 ; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
   7995 ; X64-NEXT:    vmaxpd %ymm0, %ymm1, %ymm0
   7996 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   7997 ; X64-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
   7998 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   7999 ; X64-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
   8000 ; X64-NEXT:    vzeroupper
   8001 ; X64-NEXT:    retq
   8002 entry:
   8003   %0 = bitcast i8 %__M to <8 x i1>
   8004   %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000>
   8005   %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8006   %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8007   %2 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i) #3
   8008   %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   8009   %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   8010   %3 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract6.i, <2 x double> %extract7.i) #3
   8011   %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   8012   %4 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %3, <2 x double> %shuffle.i) #3
   8013   %vecext.i = extractelement <2 x double> %4, i32 0
   8014   ret double %vecext.i
   8015 }
   8016 
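; Masked signed 64-bit min reduction: masked-off lanes get the identity
; INT64_MAX (again a constant-pool load on X86, vpbroadcastq on X64).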
   8017 define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
   8018 ; X86-LABEL: test_mm512_mask_reduce_min_epi64:
   8019 ; X86:       # %bb.0: # %entry
   8020 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   8021 ; X86-NEXT:    kmovw %eax, %k1
   8022 ; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
   8023 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
   8024 ; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
   8025 ; X86-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
   8026 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   8027 ; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
   8028 ; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   8029 ; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
   8030 ; X86-NEXT:    vmovd %xmm0, %eax
   8031 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   8032 ; X86-NEXT:    vzeroupper
   8033 ; X86-NEXT:    retl
   8034 ;
   8035 ; X64-LABEL: test_mm512_mask_reduce_min_epi64:
   8036 ; X64:       # %bb.0: # %entry
   8037 ; X64-NEXT:    kmovw %edi, %k1
   8038 ; X64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
   8039 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
   8040 ; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
   8041 ; X64-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
   8042 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   8043 ; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
   8044 ; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   8045 ; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
   8046 ; X64-NEXT:    vmovq %xmm0, %rax
   8047 ; X64-NEXT:    vzeroupper
   8048 ; X64-NEXT:    retq
   8049 entry:
   8050   %0 = bitcast i8 %__M to <8 x i1>
   8051   %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
   8052   %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   8053   %2 = icmp slt <8 x i64> %1, %shuffle.i
   8054   %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
   8055   %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   8056   %4 = icmp slt <8 x i64> %3, %shuffle3.i
   8057   %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
   8058   %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   8059   %6 = icmp slt <8 x i64> %5, %shuffle5.i
   8060   %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
   8061   %vecext.i = extractelement <8 x i64> %7, i32 0
   8062   ret i64 %vecext.i
   8063 }
   8064 
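; Masked unsigned 64-bit min reduction: the identity is all-ones, materialized
; with the vpternlogd $255 idiom instead of a constant load.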
   8065 define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
   8066 ; X86-LABEL: test_mm512_mask_reduce_min_epu64:
   8067 ; X86:       # %bb.0: # %entry
   8068 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   8069 ; X86-NEXT:    kmovw %eax, %k1
   8070 ; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
   8071 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
   8072 ; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
   8073 ; X86-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
   8074 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   8075 ; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
   8076 ; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   8077 ; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
   8078 ; X86-NEXT:    vmovd %xmm0, %eax
   8079 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
   8080 ; X86-NEXT:    vzeroupper
   8081 ; X86-NEXT:    retl
   8082 ;
   8083 ; X64-LABEL: test_mm512_mask_reduce_min_epu64:
   8084 ; X64:       # %bb.0: # %entry
   8085 ; X64-NEXT:    kmovw %edi, %k1
   8086 ; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
   8087 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
   8088 ; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
   8089 ; X64-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
   8090 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
   8091 ; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
   8092 ; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
   8093 ; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
   8094 ; X64-NEXT:    vmovq %xmm0, %rax
   8095 ; X64-NEXT:    vzeroupper
   8096 ; X64-NEXT:    retq
   8097 entry:
   8098   %0 = bitcast i8 %__M to <8 x i1>
   8099   %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
   8100   %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   8101   %2 = icmp ult <8 x i64> %1, %shuffle.i
   8102   %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
   8103   %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   8104   %4 = icmp ult <8 x i64> %3, %shuffle3.i
   8105   %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
   8106   %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   8107   %6 = icmp ult <8 x i64> %5, %shuffle5.i
   8108   %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
   8109   %vecext.i = extractelement <8 x i64> %7, i32 0
   8110   ret i64 %vecext.i
   8111 }
   8112 
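; Masked double min reduction with +Inf as the identity for masked-off lanes.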
   8113 define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) {
   8114 ; X86-LABEL: test_mm512_mask_reduce_min_pd:
   8115 ; X86:       # %bb.0: # %entry
   8116 ; X86-NEXT:    pushl %ebp
   8117 ; X86-NEXT:    .cfi_def_cfa_offset 8
   8118 ; X86-NEXT:    .cfi_offset %ebp, -8
   8119 ; X86-NEXT:    movl %esp, %ebp
   8120 ; X86-NEXT:    .cfi_def_cfa_register %ebp
   8121 ; X86-NEXT:    andl $-8, %esp
   8122 ; X86-NEXT:    subl $8, %esp
   8123 ; X86-NEXT:    movb 8(%ebp), %al
   8124 ; X86-NEXT:    kmovw %eax, %k1
   8125 ; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
   8126 ; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
   8127 ; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
   8128 ; X86-NEXT:    vminpd %ymm0, %ymm1, %ymm0
   8129 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   8130 ; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
   8131 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   8132 ; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
   8133 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
   8134 ; X86-NEXT:    fldl (%esp)
   8135 ; X86-NEXT:    movl %ebp, %esp
   8136 ; X86-NEXT:    popl %ebp
   8137 ; X86-NEXT:    .cfi_def_cfa %esp, 4
   8138 ; X86-NEXT:    vzeroupper
   8139 ; X86-NEXT:    retl
   8140 ;
   8141 ; X64-LABEL: test_mm512_mask_reduce_min_pd:
   8142 ; X64:       # %bb.0: # %entry
   8143 ; X64-NEXT:    kmovw %edi, %k1
   8144 ; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
   8145 ; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
   8146 ; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
   8147 ; X64-NEXT:    vminpd %ymm0, %ymm1, %ymm0
   8148 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   8149 ; X64-NEXT:    vminpd %xmm1, %xmm0, %xmm0
   8150 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   8151 ; X64-NEXT:    vminpd %xmm1, %xmm0, %xmm0
   8152 ; X64-NEXT:    vzeroupper
   8153 ; X64-NEXT:    retq
   8154 entry:
   8155   %0 = bitcast i8 %__M to <8 x i1>
   8156   %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>
   8157   %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8158   %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8159   %2 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i)
   8160   %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   8161   %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   8162   %3 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract6.i, <2 x double> %extract7.i)
   8163   %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   8164   %4 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %3, <2 x double> %shuffle.i)
   8165   %vecext.i = extractelement <2 x double> %4, i32 0
   8166   ret double %vecext.i
   8167 }
   8168 
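; 32-bit integer reductions narrow through ymm and xmm and return in eax, so
; the X86 and X64 output collapses into a single shared CHECK block.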
   8169 define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) {
   8170 ; CHECK-LABEL: test_mm512_reduce_max_epi32:
   8171 ; CHECK:       # %bb.0: # %entry
   8172 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   8173 ; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
   8174 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
   8175 ; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   8176 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   8177 ; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   8178 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8179 ; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   8180 ; CHECK-NEXT:    vmovd %xmm0, %eax
   8181 ; CHECK-NEXT:    vzeroupper
   8182 ; CHECK-NEXT:    ret{{[l|q]}}
   8183 entry:
   8184   %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8185   %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8186   %0 = bitcast <4 x i64> %extract.i to <8 x i32>
   8187   %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
   8188   %2 = icmp sgt <8 x i32> %0, %1
   8189   %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
   8190   %4 = bitcast <8 x i32> %3 to <4 x i64>
   8191   %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   8192   %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   8193   %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
   8194   %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
   8195   %7 = icmp sgt <4 x i32> %5, %6
   8196   %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
   8197   %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   8198   %9 = icmp sgt <4 x i32> %8, %shuffle.i
   8199   %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
   8200   %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   8201   %11 = icmp sgt <4 x i32> %10, %shuffle8.i
   8202   %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
   8203   %vecext.i = extractelement <4 x i32> %12, i32 0
   8204   ret i32 %vecext.i
   8205 }
   8206 
   8207 define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) {
   8208 ; CHECK-LABEL: test_mm512_reduce_max_epu32:
   8209 ; CHECK:       # %bb.0: # %entry
   8210 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   8211 ; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
   8212 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
   8213 ; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
   8214 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   8215 ; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
   8216 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8217 ; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
   8218 ; CHECK-NEXT:    vmovd %xmm0, %eax
   8219 ; CHECK-NEXT:    vzeroupper
   8220 ; CHECK-NEXT:    ret{{[l|q]}}
   8221 entry:
   8222   %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8223   %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8224   %0 = bitcast <4 x i64> %extract.i to <8 x i32>
   8225   %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
   8226   %2 = icmp ugt <8 x i32> %0, %1
   8227   %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
   8228   %4 = bitcast <8 x i32> %3 to <4 x i64>
   8229   %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   8230   %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   8231   %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
   8232   %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
   8233   %7 = icmp ugt <4 x i32> %5, %6
   8234   %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
   8235   %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   8236   %9 = icmp ugt <4 x i32> %8, %shuffle.i
   8237   %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
   8238   %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   8239   %11 = icmp ugt <4 x i32> %10, %shuffle8.i
   8240   %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
   8241   %vecext.i = extractelement <4 x i32> %12, i32 0
   8242   ret i32 %vecext.i
   8243 }
   8244 
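; Float max reduction built from the AVX/SSE max intrinsics; as with the other
; float reductions, the X86 version returns the scalar through the x87 stack.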
   8245 define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
   8246 ; X86-LABEL: test_mm512_reduce_max_ps:
   8247 ; X86:       # %bb.0: # %entry
   8248 ; X86-NEXT:    pushl %eax
   8249 ; X86-NEXT:    .cfi_def_cfa_offset 8
   8250 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   8251 ; X86-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
   8252 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   8253 ; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
   8254 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   8255 ; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
   8256 ; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8257 ; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
   8258 ; X86-NEXT:    vmovss %xmm0, (%esp)
   8259 ; X86-NEXT:    flds (%esp)
   8260 ; X86-NEXT:    popl %eax
   8261 ; X86-NEXT:    .cfi_def_cfa_offset 4
   8262 ; X86-NEXT:    vzeroupper
   8263 ; X86-NEXT:    retl
   8264 ;
   8265 ; X64-LABEL: test_mm512_reduce_max_ps:
   8266 ; X64:       # %bb.0: # %entry
   8267 ; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   8268 ; X64-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
   8269 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   8270 ; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
   8271 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   8272 ; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
   8273 ; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8274 ; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
   8275 ; X64-NEXT:    vzeroupper
   8276 ; X64-NEXT:    retq
   8277 entry:
   8278   %0 = bitcast <16 x float> %__W to <8 x double>
   8279   %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8280   %1 = bitcast <4 x double> %extract.i to <8 x float>
   8281   %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8282   %2 = bitcast <4 x double> %extract2.i to <8 x float>
   8283   %3 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2)
   8284   %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8285   %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8286   %4 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
   8287   %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   8288   %5 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %4, <4 x float> %shuffle.i)
   8289   %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   8290   %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %5, <4 x float> %shuffle8.i)
   8291   %vecext.i = extractelement <4 x float> %6, i32 0
   8292   ret float %vecext.i
   8293 }
   8294 
   8295 define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) {
   8296 ; CHECK-LABEL: test_mm512_reduce_min_epi32:
   8297 ; CHECK:       # %bb.0: # %entry
   8298 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   8299 ; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
   8300 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
   8301 ; CHECK-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
   8302 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   8303 ; CHECK-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
   8304 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8305 ; CHECK-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
   8306 ; CHECK-NEXT:    vmovd %xmm0, %eax
   8307 ; CHECK-NEXT:    vzeroupper
   8308 ; CHECK-NEXT:    ret{{[l|q]}}
   8309 entry:
   8310   %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8311   %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8312   %0 = bitcast <4 x i64> %extract.i to <8 x i32>
   8313   %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
   8314   %2 = icmp slt <8 x i32> %0, %1
   8315   %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
   8316   %4 = bitcast <8 x i32> %3 to <4 x i64>
   8317   %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   8318   %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   8319   %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
   8320   %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
   8321   %7 = icmp slt <4 x i32> %5, %6
   8322   %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
   8323   %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   8324   %9 = icmp slt <4 x i32> %8, %shuffle.i
   8325   %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
   8326   %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   8327   %11 = icmp slt <4 x i32> %10, %shuffle8.i
   8328   %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
   8329   %vecext.i = extractelement <4 x i32> %12, i32 0
   8330   ret i32 %vecext.i
   8331 }
   8332 
   8333 define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) {
   8334 ; CHECK-LABEL: test_mm512_reduce_min_epu32:
   8335 ; CHECK:       # %bb.0: # %entry
   8336 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   8337 ; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
   8338 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
   8339 ; CHECK-NEXT:    vpminud %xmm1, %xmm0, %xmm0
   8340 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   8341 ; CHECK-NEXT:    vpminud %xmm1, %xmm0, %xmm0
   8342 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8343 ; CHECK-NEXT:    vpminud %xmm1, %xmm0, %xmm0
   8344 ; CHECK-NEXT:    vmovd %xmm0, %eax
   8345 ; CHECK-NEXT:    vzeroupper
   8346 ; CHECK-NEXT:    ret{{[l|q]}}
   8347 entry:
   8348   %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8349   %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8350   %0 = bitcast <4 x i64> %extract.i to <8 x i32>
   8351   %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
   8352   %2 = icmp ult <8 x i32> %0, %1
   8353   %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
   8354   %4 = bitcast <8 x i32> %3 to <4 x i64>
   8355   %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   8356   %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   8357   %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
   8358   %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
   8359   %7 = icmp ult <4 x i32> %5, %6
   8360   %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
   8361   %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   8362   %9 = icmp ult <4 x i32> %8, %shuffle.i
   8363   %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
   8364   %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   8365   %11 = icmp ult <4 x i32> %10, %shuffle8.i
   8366   %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
   8367   %vecext.i = extractelement <4 x i32> %12, i32 0
   8368   ret i32 %vecext.i
   8369 }
   8370 
   8371 define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
   8372 ; X86-LABEL: test_mm512_reduce_min_ps:
   8373 ; X86:       # %bb.0: # %entry
   8374 ; X86-NEXT:    pushl %eax
   8375 ; X86-NEXT:    .cfi_def_cfa_offset 8
   8376 ; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   8377 ; X86-NEXT:    vminps %ymm1, %ymm0, %ymm0
   8378 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   8379 ; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
   8380 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   8381 ; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
   8382 ; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8383 ; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
   8384 ; X86-NEXT:    vmovss %xmm0, (%esp)
   8385 ; X86-NEXT:    flds (%esp)
   8386 ; X86-NEXT:    popl %eax
   8387 ; X86-NEXT:    .cfi_def_cfa_offset 4
   8388 ; X86-NEXT:    vzeroupper
   8389 ; X86-NEXT:    retl
   8390 ;
   8391 ; X64-LABEL: test_mm512_reduce_min_ps:
   8392 ; X64:       # %bb.0: # %entry
   8393 ; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   8394 ; X64-NEXT:    vminps %ymm1, %ymm0, %ymm0
   8395 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   8396 ; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
   8397 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   8398 ; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
   8399 ; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8400 ; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
   8401 ; X64-NEXT:    vzeroupper
   8402 ; X64-NEXT:    retq
   8403 entry:
   8404   %0 = bitcast <16 x float> %__W to <8 x double>
   8405   %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8406   %1 = bitcast <4 x double> %extract.i to <8 x float>
   8407   %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8408   %2 = bitcast <4 x double> %extract2.i to <8 x float>
   8409   %3 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2)
   8410   %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8411   %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8412   %4 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
   8413   %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   8414   %5 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %4, <4 x float> %shuffle.i)
   8415   %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   8416   %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %5, <4 x float> %shuffle8.i)
   8417   %vecext.i = extractelement <4 x float> %6, i32 0
   8418   ret float %vecext.i
   8419 }
   8420 
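; Masked signed 32-bit max reduction: masked-off lanes get the identity
; INT32_MIN (printed as the unsigned value 2147483648 in the broadcast).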
   8421 define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) {
   8422 ; X86-LABEL: test_mm512_mask_reduce_max_epi32:
   8423 ; X86:       # %bb.0: # %entry
   8424 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   8425 ; X86-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
   8426 ; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
   8427 ; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   8428 ; X86-NEXT:    vpmaxsd %ymm0, %ymm1, %ymm0
   8429 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   8430 ; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   8431 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   8432 ; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   8433 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8434 ; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   8435 ; X86-NEXT:    vmovd %xmm0, %eax
   8436 ; X86-NEXT:    vzeroupper
   8437 ; X86-NEXT:    retl
   8438 ;
   8439 ; X64-LABEL: test_mm512_mask_reduce_max_epi32:
   8440 ; X64:       # %bb.0: # %entry
   8441 ; X64-NEXT:    kmovw %edi, %k1
   8442 ; X64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
   8443 ; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
   8444 ; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   8445 ; X64-NEXT:    vpmaxsd %ymm0, %ymm1, %ymm0
   8446 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   8447 ; X64-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   8448 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   8449 ; X64-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   8450 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8451 ; X64-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
   8452 ; X64-NEXT:    vmovd %xmm0, %eax
   8453 ; X64-NEXT:    vzeroupper
   8454 ; X64-NEXT:    retq
   8455 entry:
   8456   %0 = bitcast <8 x i64> %__W to <16 x i32>
   8457   %1 = bitcast i16 %__M to <16 x i1>
   8458   %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
   8459   %3 = bitcast <16 x i32> %2 to <8 x i64>
   8460   %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8461   %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8462   %4 = bitcast <4 x i64> %extract.i to <8 x i32>
   8463   %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
   8464   %6 = icmp sgt <8 x i32> %4, %5
   8465   %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
   8466   %8 = bitcast <8 x i32> %7 to <4 x i64>
   8467   %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   8468   %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   8469   %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
   8470   %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
   8471   %11 = icmp sgt <4 x i32> %9, %10
   8472   %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
   8473   %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   8474   %13 = icmp sgt <4 x i32> %12, %shuffle.i
   8475   %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
   8476   %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   8477   %15 = icmp sgt <4 x i32> %14, %shuffle10.i
   8478   %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
   8479   %vecext.i = extractelement <4 x i32> %16, i32 0
   8480   ret i32 %vecext.i
   8481 }
   8482 
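; Masked unsigned 32-bit max reduction: the zero identity folds into a zeroing
; masked move (vmovdqa32 {%k1} {z}).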
   8483 define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) {
   8484 ; X86-LABEL: test_mm512_mask_reduce_max_epu32:
   8485 ; X86:       # %bb.0: # %entry
   8486 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   8487 ; X86-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
   8488 ; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   8489 ; X86-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
   8490 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   8491 ; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
   8492 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   8493 ; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
   8494 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8495 ; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
   8496 ; X86-NEXT:    vmovd %xmm0, %eax
   8497 ; X86-NEXT:    vzeroupper
   8498 ; X86-NEXT:    retl
   8499 ;
   8500 ; X64-LABEL: test_mm512_mask_reduce_max_epu32:
   8501 ; X64:       # %bb.0: # %entry
   8502 ; X64-NEXT:    kmovw %edi, %k1
   8503 ; X64-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
   8504 ; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   8505 ; X64-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
   8506 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   8507 ; X64-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
   8508 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   8509 ; X64-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
   8510 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8511 ; X64-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
   8512 ; X64-NEXT:    vmovd %xmm0, %eax
   8513 ; X64-NEXT:    vzeroupper
   8514 ; X64-NEXT:    retq
   8515 entry:
   8516   %0 = bitcast <8 x i64> %__W to <16 x i32>
   8517   %1 = bitcast i16 %__M to <16 x i1>
   8518   %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
   8519   %3 = bitcast <16 x i32> %2 to <8 x i64>
   8520   %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8521   %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8522   %4 = bitcast <4 x i64> %extract.i to <8 x i32>
   8523   %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
   8524   %6 = icmp ugt <8 x i32> %4, %5
   8525   %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
   8526   %8 = bitcast <8 x i32> %7 to <4 x i64>
   8527   %extract5.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   8528   %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   8529   %9 = bitcast <2 x i64> %extract5.i to <4 x i32>
   8530   %10 = bitcast <2 x i64> %extract6.i to <4 x i32>
   8531   %11 = icmp ugt <4 x i32> %9, %10
   8532   %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
   8533   %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   8534   %13 = icmp ugt <4 x i32> %12, %shuffle.i
   8535   %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
   8536   %shuffle9.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   8537   %15 = icmp ugt <4 x i32> %14, %shuffle9.i
   8538   %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle9.i
   8539   %vecext.i = extractelement <4 x i32> %16, i32 0
   8540   ret i32 %vecext.i
   8541 }
   8542 
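; Masked float max reduction with -Inf broadcast as the identity for masked-off
; lanes.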
   8543 define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) {
   8544 ; X86-LABEL: test_mm512_mask_reduce_max_ps:
   8545 ; X86:       # %bb.0: # %entry
   8546 ; X86-NEXT:    pushl %eax
   8547 ; X86-NEXT:    .cfi_def_cfa_offset 8
   8548 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   8549 ; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
   8550 ; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
   8551 ; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
   8552 ; X86-NEXT:    vmaxps %ymm0, %ymm1, %ymm0
   8553 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   8554 ; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
   8555 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   8556 ; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
   8557 ; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8558 ; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
   8559 ; X86-NEXT:    vmovss %xmm0, (%esp)
   8560 ; X86-NEXT:    flds (%esp)
   8561 ; X86-NEXT:    popl %eax
   8562 ; X86-NEXT:    .cfi_def_cfa_offset 4
   8563 ; X86-NEXT:    vzeroupper
   8564 ; X86-NEXT:    retl
   8565 ;
   8566 ; X64-LABEL: test_mm512_mask_reduce_max_ps:
   8567 ; X64:       # %bb.0: # %entry
   8568 ; X64-NEXT:    kmovw %edi, %k1
   8569 ; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
   8570 ; X64-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
   8571 ; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
   8572 ; X64-NEXT:    vmaxps %ymm0, %ymm1, %ymm0
   8573 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   8574 ; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
   8575 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   8576 ; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
   8577 ; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8578 ; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
   8579 ; X64-NEXT:    vzeroupper
   8580 ; X64-NEXT:    retq
   8581 entry:
   8582   %0 = bitcast i16 %__M to <16 x i1>
   8583   %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
   8584   %2 = bitcast <16 x float> %1 to <8 x double>
   8585   %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8586   %3 = bitcast <4 x double> %extract.i to <8 x float>
   8587   %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8588   %4 = bitcast <4 x double> %extract4.i to <8 x float>
   8589   %5 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %3, <8 x float> %4)
   8590   %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8591   %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8592   %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
   8593   %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   8594   %7 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %6, <4 x float> %shuffle.i)
   8595   %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   8596   %8 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %7, <4 x float> %shuffle10.i)
   8597   %vecext.i = extractelement <4 x float> %8, i32 0
   8598   ret float %vecext.i
   8599 }
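; A minimal C sketch (an assumption, not part of the autogenerated checks) of the intrinsic
; call this IR presumably corresponds to: masked-off lanes are seeded with -Inf so they cannot
; win the max, then the 512-bit vector is folded 256 -> 128 -> 64 -> 32 bits with vmaxps.
;
;   #include <immintrin.h>
;   float reduce_max(__mmask16 m, __m512 w) {
;     return _mm512_mask_reduce_max_ps(m, w);
;   }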
   8600 
   8601 define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) {
   8602 ; X86-LABEL: test_mm512_mask_reduce_min_epi32:
   8603 ; X86:       # %bb.0: # %entry
   8604 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   8605 ; X86-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
   8606 ; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
   8607 ; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   8608 ; X86-NEXT:    vpminsd %ymm0, %ymm1, %ymm0
   8609 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   8610 ; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
   8611 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   8612 ; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
   8613 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8614 ; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
   8615 ; X86-NEXT:    vmovd %xmm0, %eax
   8616 ; X86-NEXT:    vzeroupper
   8617 ; X86-NEXT:    retl
   8618 ;
   8619 ; X64-LABEL: test_mm512_mask_reduce_min_epi32:
   8620 ; X64:       # %bb.0: # %entry
   8621 ; X64-NEXT:    kmovw %edi, %k1
   8622 ; X64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
   8623 ; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
   8624 ; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   8625 ; X64-NEXT:    vpminsd %ymm0, %ymm1, %ymm0
   8626 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   8627 ; X64-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
   8628 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   8629 ; X64-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
   8630 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8631 ; X64-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
   8632 ; X64-NEXT:    vmovd %xmm0, %eax
   8633 ; X64-NEXT:    vzeroupper
   8634 ; X64-NEXT:    retq
   8635 entry:
   8636   %0 = bitcast <8 x i64> %__W to <16 x i32>
   8637   %1 = bitcast i16 %__M to <16 x i1>
   8638   %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
   8639   %3 = bitcast <16 x i32> %2 to <8 x i64>
   8640   %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8641   %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8642   %4 = bitcast <4 x i64> %extract.i to <8 x i32>
   8643   %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
   8644   %6 = icmp slt <8 x i32> %4, %5
   8645   %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
   8646   %8 = bitcast <8 x i32> %7 to <4 x i64>
   8647   %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   8648   %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   8649   %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
   8650   %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
   8651   %11 = icmp slt <4 x i32> %9, %10
   8652   %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
   8653   %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   8654   %13 = icmp slt <4 x i32> %12, %shuffle.i
   8655   %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
   8656   %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   8657   %15 = icmp slt <4 x i32> %14, %shuffle10.i
   8658   %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
   8659   %vecext.i = extractelement <4 x i32> %16, i32 0
   8660   ret i32 %vecext.i
   8661 }
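; Hedged C sketch (assumed source shape): the signed-min reduction seeds masked-off lanes with
; INT_MAX (2147483647) and folds the vector with vpminsd.
;
;   #include <immintrin.h>
;   int reduce_min_s32(__mmask16 m, __m512i w) {
;     return _mm512_mask_reduce_min_epi32(m, w);
;   }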
   8662 
   8663 define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) {
   8664 ; X86-LABEL: test_mm512_mask_reduce_min_epu32:
   8665 ; X86:       # %bb.0: # %entry
   8666 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   8667 ; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
   8668 ; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
   8669 ; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   8670 ; X86-NEXT:    vpminud %ymm0, %ymm1, %ymm0
   8671 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
   8672 ; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
   8673 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   8674 ; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
   8675 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8676 ; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
   8677 ; X86-NEXT:    vmovd %xmm0, %eax
   8678 ; X86-NEXT:    vzeroupper
   8679 ; X86-NEXT:    retl
   8680 ;
   8681 ; X64-LABEL: test_mm512_mask_reduce_min_epu32:
   8682 ; X64:       # %bb.0: # %entry
   8683 ; X64-NEXT:    kmovw %edi, %k1
   8684 ; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
   8685 ; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
   8686 ; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
   8687 ; X64-NEXT:    vpminud %ymm0, %ymm1, %ymm0
   8688 ; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   8689 ; X64-NEXT:    vpminud %xmm1, %xmm0, %xmm0
   8690 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   8691 ; X64-NEXT:    vpminud %xmm1, %xmm0, %xmm0
   8692 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8693 ; X64-NEXT:    vpminud %xmm1, %xmm0, %xmm0
   8694 ; X64-NEXT:    vmovd %xmm0, %eax
   8695 ; X64-NEXT:    vzeroupper
   8696 ; X64-NEXT:    retq
   8697 entry:
   8698   %0 = bitcast <8 x i64> %__W to <16 x i32>
   8699   %1 = bitcast i16 %__M to <16 x i1>
   8700   %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
   8701   %3 = bitcast <16 x i32> %2 to <8 x i64>
   8702   %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8703   %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8704   %4 = bitcast <4 x i64> %extract.i to <8 x i32>
   8705   %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
   8706   %6 = icmp ult <8 x i32> %4, %5
   8707   %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
   8708   %8 = bitcast <8 x i32> %7 to <4 x i64>
   8709   %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   8710   %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   8711   %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
   8712   %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
   8713   %11 = icmp ult <4 x i32> %9, %10
   8714   %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
   8715   %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   8716   %13 = icmp ult <4 x i32> %12, %shuffle.i
   8717   %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
   8718   %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   8719   %15 = icmp ult <4 x i32> %14, %shuffle10.i
   8720   %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
   8721   %vecext.i = extractelement <4 x i32> %16, i32 0
   8722   ret i32 %vecext.i
   8723 }
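; Hedged C sketch (assumed): the unsigned variant seeds masked-off lanes with all-ones
; (UINT_MAX, materialized by vpternlogd $255) and folds with vpminud.
;
;   #include <immintrin.h>
;   unsigned reduce_min_u32(__mmask16 m, __m512i w) {
;     return _mm512_mask_reduce_min_epu32(m, w);
;   }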
   8724 
   8725 define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) {
   8726 ; X86-LABEL: test_mm512_mask_reduce_min_ps:
   8727 ; X86:       # %bb.0: # %entry
   8728 ; X86-NEXT:    pushl %eax
   8729 ; X86-NEXT:    .cfi_def_cfa_offset 8
   8730 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   8731 ; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
   8732 ; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
   8733 ; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
   8734 ; X86-NEXT:    vminps %ymm0, %ymm1, %ymm0
   8735 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
   8736 ; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
   8737 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   8738 ; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
   8739 ; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8740 ; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
   8741 ; X86-NEXT:    vmovss %xmm0, (%esp)
   8742 ; X86-NEXT:    flds (%esp)
   8743 ; X86-NEXT:    popl %eax
   8744 ; X86-NEXT:    .cfi_def_cfa_offset 4
   8745 ; X86-NEXT:    vzeroupper
   8746 ; X86-NEXT:    retl
   8747 ;
   8748 ; X64-LABEL: test_mm512_mask_reduce_min_ps:
   8749 ; X64:       # %bb.0: # %entry
   8750 ; X64-NEXT:    kmovw %edi, %k1
   8751 ; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
   8752 ; X64-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
   8753 ; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
   8754 ; X64-NEXT:    vminps %ymm0, %ymm1, %ymm0
   8755 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
   8756 ; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
   8757 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   8758 ; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
   8759 ; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
   8760 ; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
   8761 ; X64-NEXT:    vzeroupper
   8762 ; X64-NEXT:    retq
   8763 entry:
   8764   %0 = bitcast i16 %__M to <16 x i1>
   8765   %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>
   8766   %2 = bitcast <16 x float> %1 to <8 x double>
   8767   %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8768   %3 = bitcast <4 x double> %extract.i to <8 x float>
   8769   %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8770   %4 = bitcast <4 x double> %extract4.i to <8 x float>
   8771   %5 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %3, <8 x float> %4)
   8772   %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8773   %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   8774   %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
   8775   %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   8776   %7 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %6, <4 x float> %shuffle.i)
   8777   %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   8778   %8 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %7, <4 x float> %shuffle10.i)
   8779   %vecext.i = extractelement <4 x float> %8, i32 0
   8780   ret float %vecext.i
   8781 }
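; Hedged C sketch (assumed): same reduction tree as the max case above, but seeded with +Inf
; and folded with vminps.
;
;   #include <immintrin.h>
;   float reduce_min(__mmask16 m, __m512 w) {
;     return _mm512_mask_reduce_min_ps(m, w);
;   }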
   8782 
   8783 define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
   8784 ; X86-LABEL: test_mm512_mask_max_pd:
   8785 ; X86:       # %bb.0: # %entry
   8786 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   8787 ; X86-NEXT:    kmovw %eax, %k1
   8788 ; X86-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
   8789 ; X86-NEXT:    retl
   8790 ;
   8791 ; X64-LABEL: test_mm512_mask_max_pd:
   8792 ; X64:       # %bb.0: # %entry
   8793 ; X64-NEXT:    kmovw %edi, %k1
   8794 ; X64-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
   8795 ; X64-NEXT:    retq
   8796 entry:
   8797   %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
   8798   %1 = bitcast i8 %__U to <8 x i1>
   8799   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
   8800   ret <8 x double> %2
   8801 }
   8802 
   8803 define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
   8804 ; X86-LABEL: test_mm512_maskz_max_pd:
   8805 ; X86:       # %bb.0: # %entry
   8806 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   8807 ; X86-NEXT:    kmovw %eax, %k1
   8808 ; X86-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
   8809 ; X86-NEXT:    retl
   8810 ;
   8811 ; X64-LABEL: test_mm512_maskz_max_pd:
   8812 ; X64:       # %bb.0: # %entry
   8813 ; X64-NEXT:    kmovw %edi, %k1
   8814 ; X64-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
   8815 ; X64-NEXT:    retq
   8816 entry:
   8817   %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
   8818   %1 = bitcast i8 %__U to <8 x i1>
   8819   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   8820   ret <8 x double> %2
   8821 }
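; Hedged C sketch (assumed) of the merge-masked and zero-masked forms exercised by the two
; tests above; the IR expresses masking as a select over the unmasked llvm.x86.avx512.max.pd.512
; result, which folds into the {%k1} / {%k1} {z} forms of vmaxpd.
;
;   #include <immintrin.h>
;   __m512d mask_max (__m512d w, __mmask8 u, __m512d a, __m512d b) { return _mm512_mask_max_pd(w, u, a, b); }
;   __m512d maskz_max(__mmask8 u, __m512d a, __m512d b)            { return _mm512_maskz_max_pd(u, a, b); }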
   8822 
   8823 define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
   8824 ; X86-LABEL: test_mm512_mask_max_ps:
   8825 ; X86:       # %bb.0: # %entry
   8826 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   8827 ; X86-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
   8828 ; X86-NEXT:    retl
   8829 ;
   8830 ; X64-LABEL: test_mm512_mask_max_ps:
   8831 ; X64:       # %bb.0: # %entry
   8832 ; X64-NEXT:    kmovw %edi, %k1
   8833 ; X64-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
   8834 ; X64-NEXT:    retq
   8835 entry:
   8836   %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
   8837   %1 = bitcast i16 %__U to <16 x i1>
   8838   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
   8839   ret <16 x float> %2
   8840 }
   8841 
   8842 define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
   8843 ; X86-LABEL: test_mm512_mask_max_round_pd:
   8844 ; X86:       # %bb.0: # %entry
   8845 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   8846 ; X86-NEXT:    kmovw %eax, %k1
   8847 ; X86-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
   8848 ; X86-NEXT:    retl
   8849 ;
   8850 ; X64-LABEL: test_mm512_mask_max_round_pd:
   8851 ; X64:       # %bb.0: # %entry
   8852 ; X64-NEXT:    kmovw %edi, %k1
   8853 ; X64-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
   8854 ; X64-NEXT:    retq
   8855 entry:
   8856   %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
   8857   %1 = bitcast i8 %__U to <8 x i1>
   8858   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
   8859   ret <8 x double> %2
   8860 }
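; Hedged note with a C sketch (assumed): the rounding argument i32 4 is
; _MM_FROUND_CUR_DIRECTION, so the *_round_* tests are expected to lower to the same plain
; vmaxpd as the non-round variants, with no exception-suppression operand printed.
;
;   #include <immintrin.h>
;   __m512d mask_max_round(__m512d w, __mmask8 u, __m512d a, __m512d b) {
;     return _mm512_mask_max_round_pd(w, u, a, b, _MM_FROUND_CUR_DIRECTION);
;   }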
   8861 
   8862 declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)
   8863 
   8864 define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
   8865 ; X86-LABEL: test_mm512_maskz_max_round_pd:
   8866 ; X86:       # %bb.0: # %entry
   8867 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   8868 ; X86-NEXT:    kmovw %eax, %k1
   8869 ; X86-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
   8870 ; X86-NEXT:    retl
   8871 ;
   8872 ; X64-LABEL: test_mm512_maskz_max_round_pd:
   8873 ; X64:       # %bb.0: # %entry
   8874 ; X64-NEXT:    kmovw %edi, %k1
   8875 ; X64-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
   8876 ; X64-NEXT:    retq
   8877 entry:
   8878   %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
   8879   %1 = bitcast i8 %__U to <8 x i1>
   8880   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   8881   ret <8 x double> %2
   8882 }
   8883 
   8884 define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) {
   8885 ; CHECK-LABEL: test_mm512_max_round_pd:
   8886 ; CHECK:       # %bb.0: # %entry
   8887 ; CHECK-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
   8888 ; CHECK-NEXT:    ret{{[l|q]}}
   8889 entry:
   8890   %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
   8891   ret <8 x double> %0
   8892 }
   8893 
   8894 define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
   8895 ; X86-LABEL: test_mm512_maskz_max_ps:
   8896 ; X86:       # %bb.0: # %entry
   8897 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   8898 ; X86-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
   8899 ; X86-NEXT:    retl
   8900 ;
   8901 ; X64-LABEL: test_mm512_maskz_max_ps:
   8902 ; X64:       # %bb.0: # %entry
   8903 ; X64-NEXT:    kmovw %edi, %k1
   8904 ; X64-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
   8905 ; X64-NEXT:    retq
   8906 entry:
   8907   %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
   8908   %1 = bitcast i16 %__U to <16 x i1>
   8909   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   8910   ret <16 x float> %2
   8911 }
   8912 
   8913 define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
   8914 ; X86-LABEL: test_mm512_mask_max_round_ps:
   8915 ; X86:       # %bb.0: # %entry
   8916 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   8917 ; X86-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
   8918 ; X86-NEXT:    retl
   8919 ;
   8920 ; X64-LABEL: test_mm512_mask_max_round_ps:
   8921 ; X64:       # %bb.0: # %entry
   8922 ; X64-NEXT:    kmovw %edi, %k1
   8923 ; X64-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
   8924 ; X64-NEXT:    retq
   8925 entry:
   8926   %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
   8927   %1 = bitcast i16 %__U to <16 x i1>
   8928   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
   8929   ret <16 x float> %2
   8930 }
   8931 
   8932 declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)
   8933 
   8934 define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
   8935 ; X86-LABEL: test_mm512_maskz_max_round_ps:
   8936 ; X86:       # %bb.0: # %entry
   8937 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   8938 ; X86-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
   8939 ; X86-NEXT:    retl
   8940 ;
   8941 ; X64-LABEL: test_mm512_maskz_max_round_ps:
   8942 ; X64:       # %bb.0: # %entry
   8943 ; X64-NEXT:    kmovw %edi, %k1
   8944 ; X64-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
   8945 ; X64-NEXT:    retq
   8946 entry:
   8947   %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
   8948   %1 = bitcast i16 %__U to <16 x i1>
   8949   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   8950   ret <16 x float> %2
   8951 }
   8952 
   8953 define <16 x float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) {
   8954 ; CHECK-LABEL: test_mm512_max_round_ps:
   8955 ; CHECK:       # %bb.0: # %entry
   8956 ; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
   8957 ; CHECK-NEXT:    ret{{[l|q]}}
   8958 entry:
   8959   %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
   8960   ret <16 x float> %0
   8961 }
   8962 
   8963 define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
   8964 ; X86-LABEL: test_mm512_mask_min_pd:
   8965 ; X86:       # %bb.0: # %entry
   8966 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   8967 ; X86-NEXT:    kmovw %eax, %k1
   8968 ; X86-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
   8969 ; X86-NEXT:    retl
   8970 ;
   8971 ; X64-LABEL: test_mm512_mask_min_pd:
   8972 ; X64:       # %bb.0: # %entry
   8973 ; X64-NEXT:    kmovw %edi, %k1
   8974 ; X64-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
   8975 ; X64-NEXT:    retq
   8976 entry:
   8977   %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
   8978   %1 = bitcast i8 %__U to <8 x i1>
   8979   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
   8980   ret <8 x double> %2
   8981 }
   8982 
   8983 define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
   8984 ; X86-LABEL: test_mm512_maskz_min_pd:
   8985 ; X86:       # %bb.0: # %entry
   8986 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   8987 ; X86-NEXT:    kmovw %eax, %k1
   8988 ; X86-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
   8989 ; X86-NEXT:    retl
   8990 ;
   8991 ; X64-LABEL: test_mm512_maskz_min_pd:
   8992 ; X64:       # %bb.0: # %entry
   8993 ; X64-NEXT:    kmovw %edi, %k1
   8994 ; X64-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
   8995 ; X64-NEXT:    retq
   8996 entry:
   8997   %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
   8998   %1 = bitcast i8 %__U to <8 x i1>
   8999   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   9000   ret <8 x double> %2
   9001 }
   9002 
   9003 define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
   9004 ; X86-LABEL: test_mm512_mask_min_round_pd:
   9005 ; X86:       # %bb.0: # %entry
   9006 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9007 ; X86-NEXT:    kmovw %eax, %k1
   9008 ; X86-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
   9009 ; X86-NEXT:    retl
   9010 ;
   9011 ; X64-LABEL: test_mm512_mask_min_round_pd:
   9012 ; X64:       # %bb.0: # %entry
   9013 ; X64-NEXT:    kmovw %edi, %k1
   9014 ; X64-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
   9015 ; X64-NEXT:    retq
   9016 entry:
   9017   %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
   9018   %1 = bitcast i8 %__U to <8 x i1>
   9019   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
   9020   ret <8 x double> %2
   9021 }
   9022 
   9023 declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)
   9024 
   9025 define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
   9026 ; X86-LABEL: test_mm512_maskz_min_round_pd:
   9027 ; X86:       # %bb.0: # %entry
   9028 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9029 ; X86-NEXT:    kmovw %eax, %k1
   9030 ; X86-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
   9031 ; X86-NEXT:    retl
   9032 ;
   9033 ; X64-LABEL: test_mm512_maskz_min_round_pd:
   9034 ; X64:       # %bb.0: # %entry
   9035 ; X64-NEXT:    kmovw %edi, %k1
   9036 ; X64-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
   9037 ; X64-NEXT:    retq
   9038 entry:
   9039   %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
   9040   %1 = bitcast i8 %__U to <8 x i1>
   9041   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   9042   ret <8 x double> %2
   9043 }
   9044 
   9045 define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) {
   9046 ; CHECK-LABEL: test_mm512_min_round_pd:
   9047 ; CHECK:       # %bb.0: # %entry
   9048 ; CHECK-NEXT:    vminpd %zmm1, %zmm0, %zmm0
   9049 ; CHECK-NEXT:    ret{{[l|q]}}
   9050 entry:
   9051   %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
   9052   ret <8 x double> %0
   9053 }
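; Hedged C sketch (assumed): the min tests mirror the max family one-for-one, swapping
; llvm.x86.avx512.min.pd.512 / vminpd for the max intrinsic and instruction.
;
;   #include <immintrin.h>
;   __m512d min_round(__m512d a, __m512d b) {
;     return _mm512_min_round_pd(a, b, _MM_FROUND_CUR_DIRECTION);
;   }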
   9054 
   9055 define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
   9056 ; X86-LABEL: test_mm512_mask_min_ps:
   9057 ; X86:       # %bb.0: # %entry
   9058 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9059 ; X86-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
   9060 ; X86-NEXT:    retl
   9061 ;
   9062 ; X64-LABEL: test_mm512_mask_min_ps:
   9063 ; X64:       # %bb.0: # %entry
   9064 ; X64-NEXT:    kmovw %edi, %k1
   9065 ; X64-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
   9066 ; X64-NEXT:    retq
   9067 entry:
   9068   %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
   9069   %1 = bitcast i16 %__U to <16 x i1>
   9070   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
   9071   ret <16 x float> %2
   9072 }
   9073 
   9074 define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
   9075 ; X86-LABEL: test_mm512_maskz_min_ps:
   9076 ; X86:       # %bb.0: # %entry
   9077 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9078 ; X86-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
   9079 ; X86-NEXT:    retl
   9080 ;
   9081 ; X64-LABEL: test_mm512_maskz_min_ps:
   9082 ; X64:       # %bb.0: # %entry
   9083 ; X64-NEXT:    kmovw %edi, %k1
   9084 ; X64-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
   9085 ; X64-NEXT:    retq
   9086 entry:
   9087   %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
   9088   %1 = bitcast i16 %__U to <16 x i1>
   9089   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   9090   ret <16 x float> %2
   9091 }
   9092 
   9093 define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
   9094 ; X86-LABEL: test_mm512_mask_min_round_ps:
   9095 ; X86:       # %bb.0: # %entry
   9096 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9097 ; X86-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
   9098 ; X86-NEXT:    retl
   9099 ;
   9100 ; X64-LABEL: test_mm512_mask_min_round_ps:
   9101 ; X64:       # %bb.0: # %entry
   9102 ; X64-NEXT:    kmovw %edi, %k1
   9103 ; X64-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
   9104 ; X64-NEXT:    retq
   9105 entry:
   9106   %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
   9107   %1 = bitcast i16 %__U to <16 x i1>
   9108   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
   9109   ret <16 x float> %2
   9110 }
   9111 
   9112 declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)
   9113 
   9114 define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
   9115 ; X86-LABEL: test_mm512_maskz_min_round_ps:
   9116 ; X86:       # %bb.0: # %entry
   9117 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9118 ; X86-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
   9119 ; X86-NEXT:    retl
   9120 ;
   9121 ; X64-LABEL: test_mm512_maskz_min_round_ps:
   9122 ; X64:       # %bb.0: # %entry
   9123 ; X64-NEXT:    kmovw %edi, %k1
   9124 ; X64-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
   9125 ; X64-NEXT:    retq
   9126 entry:
   9127   %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
   9128   %1 = bitcast i16 %__U to <16 x i1>
   9129   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   9130   ret <16 x float> %2
   9131 }
   9132 
   9133 define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) {
   9134 ; CHECK-LABEL: test_mm512_min_round_ps:
   9135 ; CHECK:       # %bb.0: # %entry
   9136 ; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0
   9137 ; CHECK-NEXT:    ret{{[l|q]}}
   9138 entry:
   9139   %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
   9140   ret <16 x float> %0
   9141 }
   9142 
   9143 define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) {
   9144 ; CHECK-LABEL: test_mm512_sqrt_pd:
   9145 ; CHECK:       # %bb.0: # %entry
   9146 ; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
   9147 ; CHECK-NEXT:    ret{{[l|q]}}
   9148 entry:
   9149   %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
   9150   ret <8 x double> %0
   9151 }
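; Hedged C sketch (assumed): with default rounding the intrinsic is expected to reach the
; generic llvm.sqrt.v8f64 intrinsic, which selects to an unmasked vsqrtpd.
;
;   #include <immintrin.h>
;   __m512d sqrt_pd(__m512d a) { return _mm512_sqrt_pd(a); }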
   9152 
   9153 define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
   9154 ; X86-LABEL: test_mm512_mask_sqrt_pd:
   9155 ; X86:       # %bb.0: # %entry
   9156 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9157 ; X86-NEXT:    kmovw %eax, %k1
   9158 ; X86-NEXT:    vsqrtpd %zmm1, %zmm0 {%k1}
   9159 ; X86-NEXT:    retl
   9160 ;
   9161 ; X64-LABEL: test_mm512_mask_sqrt_pd:
   9162 ; X64:       # %bb.0: # %entry
   9163 ; X64-NEXT:    kmovw %edi, %k1
   9164 ; X64-NEXT:    vsqrtpd %zmm1, %zmm0 {%k1}
   9165 ; X64-NEXT:    retq
   9166 entry:
   9167   %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
   9168   %1 = bitcast i8 %__U to <8 x i1>
   9169   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
   9170   ret <8 x double> %2
   9171 }
   9172 
   9173 define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) {
   9174 ; X86-LABEL: test_mm512_maskz_sqrt_pd:
   9175 ; X86:       # %bb.0: # %entry
   9176 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9177 ; X86-NEXT:    kmovw %eax, %k1
   9178 ; X86-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
   9179 ; X86-NEXT:    retl
   9180 ;
   9181 ; X64-LABEL: test_mm512_maskz_sqrt_pd:
   9182 ; X64:       # %bb.0: # %entry
   9183 ; X64-NEXT:    kmovw %edi, %k1
   9184 ; X64-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
   9185 ; X64-NEXT:    retq
   9186 entry:
   9187   %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
   9188   %1 = bitcast i8 %__U to <8 x i1>
   9189   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   9190   ret <8 x double> %2
   9191 }
   9192 
   9193 define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
   9194 ; X86-LABEL: test_mm512_mask_sqrt_round_pd:
   9195 ; X86:       # %bb.0: # %entry
   9196 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9197 ; X86-NEXT:    kmovw %eax, %k1
   9198 ; X86-NEXT:    vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
   9199 ; X86-NEXT:    retl
   9200 ;
   9201 ; X64-LABEL: test_mm512_mask_sqrt_round_pd:
   9202 ; X64:       # %bb.0: # %entry
   9203 ; X64-NEXT:    kmovw %edi, %k1
   9204 ; X64-NEXT:    vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
   9205 ; X64-NEXT:    retq
   9206 entry:
   9207   %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
   9208   %1 = bitcast i8 %__U to <8 x i1>
   9209   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
   9210   ret <8 x double> %2
   9211 }
   9212 
   9213 declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32)
   9214 
   9215 define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) {
   9216 ; X86-LABEL: test_mm512_maskz_sqrt_round_pd:
   9217 ; X86:       # %bb.0: # %entry
   9218 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9219 ; X86-NEXT:    kmovw %eax, %k1
   9220 ; X86-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
   9221 ; X86-NEXT:    retl
   9222 ;
   9223 ; X64-LABEL: test_mm512_maskz_sqrt_round_pd:
   9224 ; X64:       # %bb.0: # %entry
   9225 ; X64-NEXT:    kmovw %edi, %k1
   9226 ; X64-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
   9227 ; X64-NEXT:    retq
   9228 entry:
   9229   %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
   9230   %1 = bitcast i8 %__U to <8 x i1>
   9231   %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
   9232   ret <8 x double> %2
   9233 }
   9234 
   9235 define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) {
   9236 ; CHECK-LABEL: test_mm512_sqrt_round_pd:
   9237 ; CHECK:       # %bb.0: # %entry
   9238 ; CHECK-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0
   9239 ; CHECK-NEXT:    ret{{[l|q]}}
   9240 entry:
   9241   %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
   9242   ret <8 x double> %0
   9243 }
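; Hedged note with a C sketch (assumed): the rounding argument i32 8 is
; _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, which is what the printed {rn-sae} operand on
; vsqrtpd encodes.
;
;   #include <immintrin.h>
;   __m512d sqrt_rn(__m512d a) {
;     return _mm512_sqrt_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
;   }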
   9244 
   9245 define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) {
   9246 ; CHECK-LABEL: test_mm512_sqrt_ps:
   9247 ; CHECK:       # %bb.0: # %entry
   9248 ; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
   9249 ; CHECK-NEXT:    ret{{[l|q]}}
   9250 entry:
   9251   %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
   9252   ret <16 x float> %0
   9253 }
   9254 
   9255 define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
   9256 ; X86-LABEL: test_mm512_mask_sqrt_ps:
   9257 ; X86:       # %bb.0: # %entry
   9258 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9259 ; X86-NEXT:    vsqrtps %zmm1, %zmm0 {%k1}
   9260 ; X86-NEXT:    retl
   9261 ;
   9262 ; X64-LABEL: test_mm512_mask_sqrt_ps:
   9263 ; X64:       # %bb.0: # %entry
   9264 ; X64-NEXT:    kmovw %edi, %k1
   9265 ; X64-NEXT:    vsqrtps %zmm1, %zmm0 {%k1}
   9266 ; X64-NEXT:    retq
   9267 entry:
   9268   %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
   9269   %1 = bitcast i16 %__U to <16 x i1>
   9270   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
   9271   ret <16 x float> %2
   9272 }
   9273 
   9274 define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) {
   9275 ; X86-LABEL: test_mm512_maskz_sqrt_ps:
   9276 ; X86:       # %bb.0: # %entry
   9277 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9278 ; X86-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
   9279 ; X86-NEXT:    retl
   9280 ;
   9281 ; X64-LABEL: test_mm512_maskz_sqrt_ps:
   9282 ; X64:       # %bb.0: # %entry
   9283 ; X64-NEXT:    kmovw %edi, %k1
   9284 ; X64-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
   9285 ; X64-NEXT:    retq
   9286 entry:
   9287   %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
   9288   %1 = bitcast i16 %__U to <16 x i1>
   9289   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   9290   ret <16 x float> %2
   9291 }
   9292 
   9293 define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
   9294 ; X86-LABEL: test_mm512_mask_sqrt_round_ps:
   9295 ; X86:       # %bb.0: # %entry
   9296 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9297 ; X86-NEXT:    vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
   9298 ; X86-NEXT:    retl
   9299 ;
   9300 ; X64-LABEL: test_mm512_mask_sqrt_round_ps:
   9301 ; X64:       # %bb.0: # %entry
   9302 ; X64-NEXT:    kmovw %edi, %k1
   9303 ; X64-NEXT:    vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
   9304 ; X64-NEXT:    retq
   9305 entry:
   9306   %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
   9307   %1 = bitcast i16 %__U to <16 x i1>
   9308   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
   9309   ret <16 x float> %2
   9310 }
   9311 
   9312 declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32)
   9313 
   9314 define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) {
   9315 ; X86-LABEL: test_mm512_maskz_sqrt_round_ps:
   9316 ; X86:       # %bb.0: # %entry
   9317 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9318 ; X86-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
   9319 ; X86-NEXT:    retl
   9320 ;
   9321 ; X64-LABEL: test_mm512_maskz_sqrt_round_ps:
   9322 ; X64:       # %bb.0: # %entry
   9323 ; X64-NEXT:    kmovw %edi, %k1
   9324 ; X64-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
   9325 ; X64-NEXT:    retq
   9326 entry:
   9327   %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
   9328   %1 = bitcast i16 %__U to <16 x i1>
   9329   %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
   9330   ret <16 x float> %2
   9331 }
   9332 
   9333 define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) {
   9334 ; CHECK-LABEL: test_mm512_sqrt_round_ps:
   9335 ; CHECK:       # %bb.0: # %entry
   9336 ; CHECK-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0
   9337 ; CHECK-NEXT:    ret{{[l|q]}}
   9338 entry:
   9339   %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
   9340   ret <16 x float> %0
   9341 }
   9342 
   9343 define <8 x i64> @test_mm512_rol_epi32(<8 x i64> %__A) local_unnamed_addr #0 {
   9344 ; CHECK-LABEL: test_mm512_rol_epi32:
   9345 ; CHECK:       # %bb.0: # %entry
   9346 ; CHECK-NEXT:    vprold $5, %zmm0, %zmm0
   9347 ; CHECK-NEXT:    ret{{[l|q]}}
   9348 entry:
   9349   %0 = bitcast <8 x i64> %__A to <16 x i32>
   9350   %1 = tail call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %0, i32 5)
   9351   %2 = bitcast <16 x i32> %1 to <8 x i64>
   9352   ret <8 x i64> %2
   9353 }
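; Hedged C sketch (assumed): an immediate rotate-left of each 32-bit lane by 5, lowered to
; vprold; the masked and zero-masked tests below wrap the same intrinsic in a select.
;
;   #include <immintrin.h>
;   __m512i rol32(__m512i a) { return _mm512_rol_epi32(a, 5); }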
   9354 
   9355 declare <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32>, i32) #1
   9356 
   9357 define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
   9358 ; X86-LABEL: test_mm512_mask_rol_epi32:
   9359 ; X86:       # %bb.0: # %entry
   9360 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9361 ; X86-NEXT:    vprold $5, %zmm1, %zmm0 {%k1}
   9362 ; X86-NEXT:    retl
   9363 ;
   9364 ; X64-LABEL: test_mm512_mask_rol_epi32:
   9365 ; X64:       # %bb.0: # %entry
   9366 ; X64-NEXT:    kmovw %edi, %k1
   9367 ; X64-NEXT:    vprold $5, %zmm1, %zmm0 {%k1}
   9368 ; X64-NEXT:    retq
   9369 entry:
   9370   %0 = bitcast <8 x i64> %__A to <16 x i32>
   9371   %1 = tail call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %0, i32 5)
   9372   %2 = bitcast <8 x i64> %__W to <16 x i32>
   9373   %3 = bitcast i16 %__U to <16 x i1>
   9374   %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
   9375   %5 = bitcast <16 x i32> %4 to <8 x i64>
   9376   ret <8 x i64> %5
   9377 }
   9378 
   9379 define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) {
   9380 ; X86-LABEL: test_mm512_maskz_rol_epi32:
   9381 ; X86:       # %bb.0: # %entry
   9382 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9383 ; X86-NEXT:    vprold $5, %zmm0, %zmm0 {%k1} {z}
   9384 ; X86-NEXT:    retl
   9385 ;
   9386 ; X64-LABEL: test_mm512_maskz_rol_epi32:
   9387 ; X64:       # %bb.0: # %entry
   9388 ; X64-NEXT:    kmovw %edi, %k1
   9389 ; X64-NEXT:    vprold $5, %zmm0, %zmm0 {%k1} {z}
   9390 ; X64-NEXT:    retq
   9391 entry:
   9392   %0 = bitcast <8 x i64> %__A to <16 x i32>
   9393   %1 = tail call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %0, i32 5)
   9394   %2 = bitcast i16 %__U to <16 x i1>
   9395   %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
   9396   %4 = bitcast <16 x i32> %3 to <8 x i64>
   9397   ret <8 x i64> %4
   9398 }
   9399 
   9400 define <8 x i64> @test_mm512_rol_epi64(<8 x i64> %__A) {
   9401 ; CHECK-LABEL: test_mm512_rol_epi64:
   9402 ; CHECK:       # %bb.0: # %entry
   9403 ; CHECK-NEXT:    vprolq $5, %zmm0, %zmm0
   9404 ; CHECK-NEXT:    ret{{[l|q]}}
   9405 entry:
   9406   %0 = tail call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %__A, i32 5)
   9407   ret <8 x i64> %0
   9408 }
   9409 
   9410 declare <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64>, i32) #1
   9411 
   9412 define <8 x i64> @test_mm512_mask_rol_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
   9413 ; X86-LABEL: test_mm512_mask_rol_epi64:
   9414 ; X86:       # %bb.0: # %entry
   9415 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9416 ; X86-NEXT:    kmovw %eax, %k1
   9417 ; X86-NEXT:    vprolq $5, %zmm1, %zmm0 {%k1}
   9418 ; X86-NEXT:    retl
   9419 ;
   9420 ; X64-LABEL: test_mm512_mask_rol_epi64:
   9421 ; X64:       # %bb.0: # %entry
   9422 ; X64-NEXT:    kmovw %edi, %k1
   9423 ; X64-NEXT:    vprolq $5, %zmm1, %zmm0 {%k1}
   9424 ; X64-NEXT:    retq
   9425 entry:
   9426   %0 = tail call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %__A, i32 5)
   9427   %1 = bitcast i8 %__U to <8 x i1>
   9428   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
   9429   ret <8 x i64> %2
   9430 }
   9431 
   9432 define <8 x i64> @test_mm512_maskz_rol_epi64(i8 zeroext %__U, <8 x i64> %__A) {
   9433 ; X86-LABEL: test_mm512_maskz_rol_epi64:
   9434 ; X86:       # %bb.0: # %entry
   9435 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9436 ; X86-NEXT:    kmovw %eax, %k1
   9437 ; X86-NEXT:    vprolq $5, %zmm0, %zmm0 {%k1} {z}
   9438 ; X86-NEXT:    retl
   9439 ;
   9440 ; X64-LABEL: test_mm512_maskz_rol_epi64:
   9441 ; X64:       # %bb.0: # %entry
   9442 ; X64-NEXT:    kmovw %edi, %k1
   9443 ; X64-NEXT:    vprolq $5, %zmm0, %zmm0 {%k1} {z}
   9444 ; X64-NEXT:    retq
   9445 entry:
   9446   %0 = tail call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %__A, i32 5)
   9447   %1 = bitcast i8 %__U to <8 x i1>
   9448   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
   9449   ret <8 x i64> %2
   9450 }
   9451 
   9452 define <8 x i64> @test_mm512_rolv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
   9453 ; CHECK-LABEL: test_mm512_rolv_epi32:
   9454 ; CHECK:       # %bb.0: # %entry
   9455 ; CHECK-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
   9456 ; CHECK-NEXT:    ret{{[l|q]}}
   9457 entry:
   9458   %0 = bitcast <8 x i64> %__A to <16 x i32>
   9459   %1 = bitcast <8 x i64> %__B to <16 x i32>
   9460   %2 = tail call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %0, <16 x i32> %1)
   9461   %3 = bitcast <16 x i32> %2 to <8 x i64>
   9462   ret <8 x i64> %3
   9463 }
   9464 
   9465 define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
   9466 ; X86-LABEL: test_mm512_mask_rolv_epi32:
   9467 ; X86:       # %bb.0: # %entry
   9468 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9469 ; X86-NEXT:    vprolvd %zmm2, %zmm1, %zmm0 {%k1}
   9470 ; X86-NEXT:    retl
   9471 ;
   9472 ; X64-LABEL: test_mm512_mask_rolv_epi32:
   9473 ; X64:       # %bb.0: # %entry
   9474 ; X64-NEXT:    kmovw %edi, %k1
   9475 ; X64-NEXT:    vprolvd %zmm2, %zmm1, %zmm0 {%k1}
   9476 ; X64-NEXT:    retq
   9477 entry:
   9478   %0 = bitcast <8 x i64> %__A to <16 x i32>
   9479   %1 = bitcast <8 x i64> %__B to <16 x i32>
   9480   %2 = tail call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %0, <16 x i32> %1)
   9481   %3 = bitcast <8 x i64> %__W to <16 x i32>
   9482   %4 = bitcast i16 %__U to <16 x i1>
   9483   %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
   9484   %6 = bitcast <16 x i32> %5 to <8 x i64>
   9485   ret <8 x i64> %6
   9486 }
   9487 
   9488 define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
   9489 ; X86-LABEL: test_mm512_maskz_rolv_epi32:
   9490 ; X86:       # %bb.0: # %entry
   9491 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9492 ; X86-NEXT:    vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
   9493 ; X86-NEXT:    retl
   9494 ;
   9495 ; X64-LABEL: test_mm512_maskz_rolv_epi32:
   9496 ; X64:       # %bb.0: # %entry
   9497 ; X64-NEXT:    kmovw %edi, %k1
   9498 ; X64-NEXT:    vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
   9499 ; X64-NEXT:    retq
   9500 entry:
   9501   %0 = bitcast <8 x i64> %__A to <16 x i32>
   9502   %1 = bitcast <8 x i64> %__B to <16 x i32>
   9503   %2 = tail call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %0, <16 x i32> %1)
   9504   %3 = bitcast i16 %__U to <16 x i1>
   9505   %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
   9506   %5 = bitcast <16 x i32> %4 to <8 x i64>
   9507   ret <8 x i64> %5
   9508 }
   9509 
   9510 define <8 x i64> @test_mm512_rolv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
   9511 ; CHECK-LABEL: test_mm512_rolv_epi64:
   9512 ; CHECK:       # %bb.0: # %entry
   9513 ; CHECK-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
   9514 ; CHECK-NEXT:    ret{{[l|q]}}
   9515 entry:
   9516   %0 = tail call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %__A, <8 x i64> %__B)
   9517   ret <8 x i64> %0
   9518 }
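; Hedged C sketch (assumed): the *v variants take a per-lane rotate count from a second vector
; operand instead of an immediate, lowering to vprolvd / vprolvq.
;
;   #include <immintrin.h>
;   __m512i rolv32(__m512i a, __m512i cnt) { return _mm512_rolv_epi32(a, cnt); }
;   __m512i rolv64(__m512i a, __m512i cnt) { return _mm512_rolv_epi64(a, cnt); }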
   9519 
   9520 define <8 x i64> @test_mm512_mask_rolv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
   9521 ; X86-LABEL: test_mm512_mask_rolv_epi64:
   9522 ; X86:       # %bb.0: # %entry
   9523 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9524 ; X86-NEXT:    kmovw %eax, %k1
   9525 ; X86-NEXT:    vprolvq %zmm2, %zmm1, %zmm0 {%k1}
   9526 ; X86-NEXT:    retl
   9527 ;
   9528 ; X64-LABEL: test_mm512_mask_rolv_epi64:
   9529 ; X64:       # %bb.0: # %entry
   9530 ; X64-NEXT:    kmovw %edi, %k1
   9531 ; X64-NEXT:    vprolvq %zmm2, %zmm1, %zmm0 {%k1}
   9532 ; X64-NEXT:    retq
   9533 entry:
   9534   %0 = tail call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %__A, <8 x i64> %__B)
   9535   %1 = bitcast i8 %__U to <8 x i1>
   9536   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
   9537   ret <8 x i64> %2
   9538 }
   9539 
   9540 define <8 x i64> @test_mm512_maskz_rolv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
   9541 ; X86-LABEL: test_mm512_maskz_rolv_epi64:
   9542 ; X86:       # %bb.0: # %entry
   9543 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9544 ; X86-NEXT:    kmovw %eax, %k1
   9545 ; X86-NEXT:    vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
   9546 ; X86-NEXT:    retl
   9547 ;
   9548 ; X64-LABEL: test_mm512_maskz_rolv_epi64:
   9549 ; X64:       # %bb.0: # %entry
   9550 ; X64-NEXT:    kmovw %edi, %k1
   9551 ; X64-NEXT:    vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
   9552 ; X64-NEXT:    retq
   9553 entry:
   9554   %0 = tail call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %__A, <8 x i64> %__B)
   9555   %1 = bitcast i8 %__U to <8 x i1>
   9556   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
   9557   ret <8 x i64> %2
   9558 }
   9559 
   9560 define <8 x i64> @test_mm512_ror_epi32(<8 x i64> %__A) {
   9561 ; CHECK-LABEL: test_mm512_ror_epi32:
   9562 ; CHECK:       # %bb.0: # %entry
   9563 ; CHECK-NEXT:    vprord $5, %zmm0, %zmm0
   9564 ; CHECK-NEXT:    ret{{[l|q]}}
   9565 entry:
   9566   %0 = bitcast <8 x i64> %__A to <16 x i32>
   9567   %1 = tail call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %0, i32 5)
   9568   %2 = bitcast <16 x i32> %1 to <8 x i64>
   9569   ret <8 x i64> %2
   9570 }
   9571 
   9572 declare <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32>, i32) #1
   9573 
   9574 define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
   9575 ; X86-LABEL: test_mm512_mask_ror_epi32:
   9576 ; X86:       # %bb.0: # %entry
   9577 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9578 ; X86-NEXT:    vprord $5, %zmm1, %zmm0 {%k1}
   9579 ; X86-NEXT:    retl
   9580 ;
   9581 ; X64-LABEL: test_mm512_mask_ror_epi32:
   9582 ; X64:       # %bb.0: # %entry
   9583 ; X64-NEXT:    kmovw %edi, %k1
   9584 ; X64-NEXT:    vprord $5, %zmm1, %zmm0 {%k1}
   9585 ; X64-NEXT:    retq
   9586 entry:
   9587   %0 = bitcast <8 x i64> %__A to <16 x i32>
   9588   %1 = tail call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %0, i32 5)
   9589   %2 = bitcast <8 x i64> %__W to <16 x i32>
   9590   %3 = bitcast i16 %__U to <16 x i1>
   9591   %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
   9592   %5 = bitcast <16 x i32> %4 to <8 x i64>
   9593   ret <8 x i64> %5
   9594 }
   9595 
   9596 define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) {
   9597 ; X86-LABEL: test_mm512_maskz_ror_epi32:
   9598 ; X86:       # %bb.0: # %entry
   9599 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9600 ; X86-NEXT:    vprord $5, %zmm0, %zmm0 {%k1} {z}
   9601 ; X86-NEXT:    retl
   9602 ;
   9603 ; X64-LABEL: test_mm512_maskz_ror_epi32:
   9604 ; X64:       # %bb.0: # %entry
   9605 ; X64-NEXT:    kmovw %edi, %k1
   9606 ; X64-NEXT:    vprord $5, %zmm0, %zmm0 {%k1} {z}
   9607 ; X64-NEXT:    retq
   9608 entry:
   9609   %0 = bitcast <8 x i64> %__A to <16 x i32>
   9610   %1 = tail call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %0, i32 5)
   9611   %2 = bitcast i16 %__U to <16 x i1>
   9612   %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
   9613   %4 = bitcast <16 x i32> %3 to <8 x i64>
   9614   ret <8 x i64> %4
   9615 }
   9616 
   9617 define <8 x i64> @test_mm512_ror_epi64(<8 x i64> %__A) {
   9618 ; CHECK-LABEL: test_mm512_ror_epi64:
   9619 ; CHECK:       # %bb.0: # %entry
   9620 ; CHECK-NEXT:    vprorq $5, %zmm0, %zmm0
   9621 ; CHECK-NEXT:    ret{{[l|q]}}
   9622 entry:
   9623   %0 = tail call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %__A, i32 5)
   9624   ret <8 x i64> %0
   9625 }
   9626 
   9627 declare <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64>, i32) #1
   9628 
   9629 define <8 x i64> @test_mm512_mask_ror_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
   9630 ; X86-LABEL: test_mm512_mask_ror_epi64:
   9631 ; X86:       # %bb.0: # %entry
   9632 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9633 ; X86-NEXT:    kmovw %eax, %k1
   9634 ; X86-NEXT:    vprorq $5, %zmm1, %zmm0 {%k1}
   9635 ; X86-NEXT:    retl
   9636 ;
   9637 ; X64-LABEL: test_mm512_mask_ror_epi64:
   9638 ; X64:       # %bb.0: # %entry
   9639 ; X64-NEXT:    kmovw %edi, %k1
   9640 ; X64-NEXT:    vprorq $5, %zmm1, %zmm0 {%k1}
   9641 ; X64-NEXT:    retq
   9642 entry:
   9643   %0 = tail call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %__A, i32 5)
   9644   %1 = bitcast i8 %__U to <8 x i1>
   9645   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
   9646   ret <8 x i64> %2
   9647 }
   9648 
   9649 define <8 x i64> @test_mm512_maskz_ror_epi64(i8 zeroext %__U, <8 x i64> %__A) {
   9650 ; X86-LABEL: test_mm512_maskz_ror_epi64:
   9651 ; X86:       # %bb.0: # %entry
   9652 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9653 ; X86-NEXT:    kmovw %eax, %k1
   9654 ; X86-NEXT:    vprorq $5, %zmm0, %zmm0 {%k1} {z}
   9655 ; X86-NEXT:    retl
   9656 ;
   9657 ; X64-LABEL: test_mm512_maskz_ror_epi64:
   9658 ; X64:       # %bb.0: # %entry
   9659 ; X64-NEXT:    kmovw %edi, %k1
   9660 ; X64-NEXT:    vprorq $5, %zmm0, %zmm0 {%k1} {z}
   9661 ; X64-NEXT:    retq
   9662 entry:
   9663   %0 = tail call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %__A, i32 5)
   9664   %1 = bitcast i8 %__U to <8 x i1>
   9665   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
   9666   ret <8 x i64> %2
   9667 }
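; Hedged C sketch (assumed): rotate-right mirrors rotate-left, using the pror* intrinsics and
; the vprord / vprorq instructions.
;
;   #include <immintrin.h>
;   __m512i ror32(__m512i a) { return _mm512_ror_epi32(a, 5); }
;   __m512i ror64(__m512i a) { return _mm512_ror_epi64(a, 5); }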
   9668 
   9669 define <8 x i64> @test_mm512_rorv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
   9670 ; CHECK-LABEL: test_mm512_rorv_epi32:
   9671 ; CHECK:       # %bb.0: # %entry
   9672 ; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
   9673 ; CHECK-NEXT:    ret{{[l|q]}}
   9674 entry:
   9675   %0 = bitcast <8 x i64> %__A to <16 x i32>
   9676   %1 = bitcast <8 x i64> %__B to <16 x i32>
   9677   %2 = tail call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %0, <16 x i32> %1)
   9678   %3 = bitcast <16 x i32> %2 to <8 x i64>
   9679   ret <8 x i64> %3
   9680 }
   9681 
   9682 define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
   9683 ; X86-LABEL: test_mm512_mask_rorv_epi32:
   9684 ; X86:       # %bb.0: # %entry
   9685 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9686 ; X86-NEXT:    vprorvd %zmm2, %zmm1, %zmm0 {%k1}
   9687 ; X86-NEXT:    retl
   9688 ;
   9689 ; X64-LABEL: test_mm512_mask_rorv_epi32:
   9690 ; X64:       # %bb.0: # %entry
   9691 ; X64-NEXT:    kmovw %edi, %k1
   9692 ; X64-NEXT:    vprorvd %zmm2, %zmm1, %zmm0 {%k1}
   9693 ; X64-NEXT:    retq
   9694 entry:
   9695   %0 = bitcast <8 x i64> %__A to <16 x i32>
   9696   %1 = bitcast <8 x i64> %__B to <16 x i32>
   9697   %2 = tail call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %0, <16 x i32> %1)
   9698   %3 = bitcast <8 x i64> %__W to <16 x i32>
   9699   %4 = bitcast i16 %__U to <16 x i1>
   9700   %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
   9701   %6 = bitcast <16 x i32> %5 to <8 x i64>
   9702   ret <8 x i64> %6
   9703 }
   9704 
   9705 define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
   9706 ; X86-LABEL: test_mm512_maskz_rorv_epi32:
   9707 ; X86:       # %bb.0: # %entry
   9708 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
   9709 ; X86-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
   9710 ; X86-NEXT:    retl
   9711 ;
   9712 ; X64-LABEL: test_mm512_maskz_rorv_epi32:
   9713 ; X64:       # %bb.0: # %entry
   9714 ; X64-NEXT:    kmovw %edi, %k1
   9715 ; X64-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
   9716 ; X64-NEXT:    retq
   9717 entry:
   9718   %0 = bitcast <8 x i64> %__A to <16 x i32>
   9719   %1 = bitcast <8 x i64> %__B to <16 x i32>
   9720   %2 = tail call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %0, <16 x i32> %1)
   9721   %3 = bitcast i16 %__U to <16 x i1>
   9722   %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
   9723   %5 = bitcast <16 x i32> %4 to <8 x i64>
   9724   ret <8 x i64> %5
   9725 }
   9726 
   9727 define <8 x i64> @test_mm512_rorv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
   9728 ; CHECK-LABEL: test_mm512_rorv_epi64:
   9729 ; CHECK:       # %bb.0: # %entry
   9730 ; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm0
   9731 ; CHECK-NEXT:    ret{{[l|q]}}
   9732 entry:
   9733   %0 = tail call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %__A, <8 x i64> %__B)
   9734   ret <8 x i64> %0
   9735 }
   9736 
   9737 define <8 x i64> @test_mm512_mask_rorv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
   9738 ; X86-LABEL: test_mm512_mask_rorv_epi64:
   9739 ; X86:       # %bb.0: # %entry
   9740 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9741 ; X86-NEXT:    kmovw %eax, %k1
   9742 ; X86-NEXT:    vprorvq %zmm2, %zmm1, %zmm0 {%k1}
   9743 ; X86-NEXT:    retl
   9744 ;
   9745 ; X64-LABEL: test_mm512_mask_rorv_epi64:
   9746 ; X64:       # %bb.0: # %entry
   9747 ; X64-NEXT:    kmovw %edi, %k1
   9748 ; X64-NEXT:    vprorvq %zmm2, %zmm1, %zmm0 {%k1}
   9749 ; X64-NEXT:    retq
   9750 entry:
   9751   %0 = tail call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %__A, <8 x i64> %__B)
   9752   %1 = bitcast i8 %__U to <8 x i1>
   9753   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
   9754   ret <8 x i64> %2
   9755 }
   9756 
   9757 define <8 x i64> @test_mm512_maskz_rorv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
   9758 ; X86-LABEL: test_mm512_maskz_rorv_epi64:
   9759 ; X86:       # %bb.0: # %entry
   9760 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
   9761 ; X86-NEXT:    kmovw %eax, %k1
   9762 ; X86-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
   9763 ; X86-NEXT:    retl
   9764 ;
   9765 ; X64-LABEL: test_mm512_maskz_rorv_epi64:
   9766 ; X64:       # %bb.0: # %entry
   9767 ; X64-NEXT:    kmovw %edi, %k1
   9768 ; X64-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
   9769 ; X64-NEXT:    retq
   9770 entry:
   9771   %0 = tail call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %__A, <8 x i64> %__B)
   9772   %1 = bitcast i8 %__U to <8 x i1>
   9773   %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
   9774   ret <8 x i64> %2
   9775 }
   9776 
   9777 declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
   9778 declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
   9779 declare float @llvm.fma.f32(float, float, float) #9
   9780 declare double @llvm.fma.f64(double, double, double) #9
   9781 declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>)
   9782 declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
   9783 declare <16 x i32> @llvm.masked.expandload.v16i32(i32*, <16 x i1>, <16 x i32>) #10
   9784 declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
   9785 declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>)
   9786 declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64*, <8 x i1>)
   9787 declare void @llvm.masked.compressstore.v16f32(<16 x float>, float*, <16 x i1>)
   9788 declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32*, <16 x i1>)
   9789 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>)
   9790 declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
   9791 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>)
   9792 declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
   9793 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
   9794 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
   9795 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
   9796 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
   9797 declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
   9798 declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
   9799 declare <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32>, <16 x i32>)
   9800 declare <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64>, <8 x i64>)
   9801 declare <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32>, <16 x i32>)
   9802 declare <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64>, <8 x i64>)
   9803 
   9804 !0 = !{i32 1}
   9805 
   9806