; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c

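; As a sketch of the correspondence noted above (illustration only, not part
; of the test; the wrapper name below is hypothetical), the IR in
; test_mm_mask_broadcastd_epi32 matches what clang emits for a call such as:
;
;   #include <immintrin.h>
;   __m128i mask_broadcast_lane0(__m128i src, __mmask8 k, __m128i a) {
;     return _mm_mask_broadcastd_epi32(src, k, a);
;   }
;
; For these broadcasts clang lowers the builtin to the generic
; trunc/bitcast/shufflevector/select sequence seen below rather than to a
; target-specific intrinsic call.
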
define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = bitcast <4 x i32> %res0 to <2 x i64>
  ret <2 x i64> %res1
}

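; The masked variants below model the write-mask by truncating the i8 argument
; (an __mmask8 at the C level) to one bit per vector element and bitcasting it
; to a vector of i1; the select then chooses between the shuffle result and
; either the pass-through operand (mask) or zero (maskz).
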
define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp0:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x i32> %res0, <4 x i32> %arg0
  %res2 = bitcast <4 x i32> %res1 to <2 x i64>
  ret <2 x i64> %res2
}

define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp1:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x i32> %res0, <4 x i32> zeroinitializer
  %res2 = bitcast <4 x i32> %res1 to <2 x i64>
  ret <2 x i64> %res2
}

define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm256_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpbroadcastd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = bitcast <8 x i32> %res0 to <4 x i64>
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}

define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp2:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg1, <2 x i64> %res0, <2 x i64> %a0
  ret <2 x i64> %res1
}

define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp3:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn0 to <2 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg0, <2 x i64> %res0, <2 x i64> zeroinitializer
  ret <2 x i64> %res1
}

define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm256_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpbroadcastq %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    retq
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp4:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp5:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer
  ret <4 x i64> %res1
}

define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_mask_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp6:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
  ret <2 x double> %res1
}

define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_maskz_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp7:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn0 to <2 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
  ret <2 x double> %res1
}

define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
; X32-LABEL: test_mm256_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm256_mask_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp8:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp9:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp10:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp11:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
; X32-LABEL: test_mm256_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm256_mask_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_mask_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp12:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
  ret <2 x double> %res1
}

define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_maskz_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp13:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
  ret <2 x double> %res1
}

define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
; X32-LABEL: test_mm256_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_mask_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp14:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_maskz_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp15:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp16:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp17:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
; X32-LABEL: test_mm256_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_mask_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_maskz_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp18:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp19:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
; X32-LABEL: test_mm256_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_mask_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_maskz_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
; X32-LABEL: test_mm256_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64> %a2) {
; X32-LABEL: test_mm256_mask_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp20:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x i64> %a2, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp21:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer
  ret <4 x i64> %res1
}

define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
; X32-LABEL: test_mm256_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_mask_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp22:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp23:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_mm_mask_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp24:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> %a3, <2 x i32> <i32 1, i32 3>
  %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
  ret <2 x double> %res1
}

define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp25:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> %a2, <2 x i32> <i32 1, i32 3>
  %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
  ret <2 x double> %res1
}

define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_mm256_mask_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp26:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> %a3, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_maskz_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp27:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> %a2, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_mm_mask_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp28:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> %a3, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp29:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> %a2, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_mm256_mask_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_maskz_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

!0 = !{i32 1}